Merge tag 'fsnotify_for_v5.17-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 15:51:31 +0000 (17:51 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 15:51:31 +0000 (17:51 +0200)
Pull fsnotify fixes from Jan Kara:
 "Fixes for userspace breakage caused by fsnotify changes ~3 years ago
  and one fanotify cleanup"

* tag 'fsnotify_for_v5.17-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  fsnotify: fix fsnotify hooks in pseudo filesystems
  fsnotify: invalidate dcache before IN_DELETE event
  fanotify: remove variable set but not used

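The dcache fix matters because some applications use IN_DELETE as an
invalidation signal and expect that, once the event has been read,
looking up the deleted name no longer succeeds. A minimal userspace
sketch of that assumption (error handling trimmed; the /tmp/watched
path is made up for illustration):

#include <stdio.h>
#include <sys/inotify.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct stat st;
	int fd = inotify_init1(0);

	inotify_add_watch(fd, "/tmp/watched", IN_DELETE);
	ssize_t len = read(fd, buf, sizeof(buf));	/* blocks until an event arrives */

	for (ssize_t off = 0; off < len; ) {
		struct inotify_event *ev = (struct inotify_event *)(buf + off);
		char path[4096];

		off += sizeof(*ev) + ev->len;
		if (!(ev->mask & IN_DELETE) || !ev->len)
			continue;
		snprintf(path, sizeof(path), "/tmp/watched/%s", ev->name);
		/* With the fix, the stale dentry is gone before the event: */
		if (stat(path, &st) != 0)
			printf("%s: gone, as expected\n", ev->name);
	}
	close(fd);
	return 0;
}

Before the fix, the IN_DELETE event could be delivered while a stale
positive dentry was still in the dcache, so the stat() above could
transiently succeed against the already-deleted name.
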
fs/btrfs/ioctl.c
fs/namei.c
fs/notify/fanotify/fanotify_user.c

diff --combined fs/btrfs/ioctl.c
@@@ -1214,35 -1214,6 +1214,35 @@@ static int defrag_collect_targets(struc
                        goto next;
  
                /*
 +               * Our start offset might be in the middle of an existing extent
 +               * map, so take that into account.
 +               */
 +              range_len = em->len - (cur - em->start);
 +              /*
 +               * If this range of the extent map is already flagged for delalloc,
 +               * skip it, because:
 +               *
 +               * 1) We could deadlock later, when trying to reserve space for
 +               *    delalloc, because in case we can't immediately reserve space
 +               *    the flusher can start delalloc and wait for the respective
 +               *    ordered extents to complete. The deadlock would happen
 +               *    because we do the space reservation while holding the range
 +               *    locked, and starting writeback, or finishing an ordered
 +               *    extent, requires locking the range;
 +               *
 +               * 2) If there's delalloc there, it means there are dirty pages for
 +               *    which writeback has not started yet (we clean the delalloc
 +               *    flag when starting writeback and after creating an ordered
 +               *    extent). If we mark pages in an adjacent range for defrag,
 +               *    then we will have a larger contiguous range for delalloc,
 +               *    very likely resulting in a larger extent after writeback is
 +               *    triggered (except in a case of free space fragmentation).
 +               */
 +              if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
 +                                 EXTENT_DELALLOC, 0, NULL))
 +                      goto next;
 +
 +              /*
                 * For do_compress case, we want to compress all valid file
                 * extents, thus no @extent_thresh or mergeable check.
                 */
                        goto add;
  
                /* Skip too large extent */
 -              if (em->len >= extent_thresh)
 +              if (range_len >= extent_thresh)
                        goto next;
  
                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@@ -1471,11 -1442,9 +1471,11 @@@ static int defrag_one_cluster(struct bt
        list_for_each_entry(entry, &target_list, list) {
                u32 range_len = entry->len;
  
 -              /* Reached the limit */
 -              if (max_sectors && max_sectors == *sectors_defragged)
 +              /* Reached or beyond the limit */
 +              if (max_sectors && *sectors_defragged >= max_sectors) {
 +                      ret = 1;
                        break;
 +              }
  
                if (max_sectors)
                        range_len = min_t(u32, range_len,
                                       extent_thresh, newer_than, do_compress);
                if (ret < 0)
                        break;
 -              *sectors_defragged += range_len;
 +              *sectors_defragged += range_len >>
 +                                    inode->root->fs_info->sectorsize_bits;
        }
  out:
        list_for_each_entry_safe(entry, tmp, &target_list, list) {
   * @newer_than:          minimum transid to defrag
   * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
   *               will be defragged.
 + *
 + * Return <0 for error.
 + * Return >=0 for the number of sectors defragged, and range->start will be updated
 + * to indicate the file offset where the next defrag should start.
 + * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
 + *  defragging the whole range.)
   */
  int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                      struct btrfs_ioctl_defrag_range_args *range,
        int compress_type = BTRFS_COMPRESS_ZLIB;
        int ret = 0;
        u32 extent_thresh = range->extent_thresh;
 +      pgoff_t start_index;
  
        if (isize == 0)
                return 0;
  
        if (range->start + range->len > range->start) {
                /* Got a specific range */
 -              last_byte = min(isize, range->start + range->len) - 1;
 +              last_byte = min(isize, range->start + range->len);
        } else {
                /* Defrag until file end */
 -              last_byte = isize - 1;
 +              last_byte = isize;
        }
  
 +      /* Align the range */
 +      cur = round_down(range->start, fs_info->sectorsize);
 +      last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 +
        /*
         * If we were not given a ra, allocate a readahead context. As
         * readahead is just an optimization, defrag will work without it so
                        file_ra_state_init(ra, inode->i_mapping);
        }
  
 -      /* Align the range */
 -      cur = round_down(range->start, fs_info->sectorsize);
 -      last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 +      /*
 +       * Make writeback start from the beginning of the range, so that the
 +       * defrag range can be written sequentially.
 +       */
 +      start_index = cur >> PAGE_SHIFT;
 +      if (start_index < inode->i_mapping->writeback_index)
 +              inode->i_mapping->writeback_index = start_index;
  
        while (cur < last_byte) {
 +              const unsigned long prev_sectors_defragged = sectors_defragged;
                u64 cluster_end;
  
                /* The cluster size 256K should always be page aligned */
                BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
  
 +              if (btrfs_defrag_cancelled(fs_info)) {
 +                      ret = -EAGAIN;
 +                      break;
 +              }
 +
                /* We want the cluster end at page boundary when possible */
                cluster_end = (((cur >> PAGE_SHIFT) +
                               (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
                                cluster_end + 1 - cur, extent_thresh,
                                newer_than, do_compress,
                                &sectors_defragged, max_to_defrag);
 +
 +              if (sectors_defragged > prev_sectors_defragged)
 +                      balance_dirty_pages_ratelimited(inode->i_mapping);
 +
                btrfs_inode_unlock(inode, 0);
                if (ret < 0)
                        break;
                cur = cluster_end + 1;
 +              if (ret > 0) {
 +                      ret = 0;
 +                      break;
 +              }
        }
  
        if (ra_allocated)
                kfree(ra);
 +      /*
 +       * Update range.start for autodefrag; this indicates where to start
 +       * in the next run.
 +       */
 +      range->start = cur;
        if (sectors_defragged) {
                /*
                 * We have defragged some sectors, for compression case they
@@@ -3152,10 -3086,8 +3152,8 @@@ static noinline int btrfs_ioctl_snap_de
        btrfs_inode_lock(inode, 0);
        err = btrfs_delete_subvolume(dir, dentry);
        btrfs_inode_unlock(inode, 0);
-       if (!err) {
-               fsnotify_rmdir(dir, dentry);
-               d_delete(dentry);
-       }
+       if (!err)
+               d_delete_notify(dir, dentry);
  
  out_dput:
        dput(dentry);
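In the defrag hunks above, *sectors_defragged switches from accumulating
byte lengths to sector counts, since sectors are the unit the
@max_to_defrag limit is expressed in. A standalone sketch of the
conversion (the values are made up; btrfs derives sectorsize_bits from
the filesystem's sector size):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;	/* 4K sectors */
	const uint32_t range_len = 256 * 1024;	/* one 256K defrag cluster, in bytes */
	uint64_t sectors_defragged = 0;

	/* The old accounting mixed units: sectors_defragged += range_len; */
	/* The fix shifts the byte length down to sectors first: */
	sectors_defragged += range_len >> sectorsize_bits;

	printf("defragged %llu sectors (%u bytes)\n",
	       (unsigned long long)sectors_defragged, range_len);	/* 64 sectors */
	return 0;
}
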
diff --combined fs/namei.c
@@@ -1020,60 -1020,10 +1020,60 @@@ static inline void put_link(struct name
                path_put(&last->link);
  }
  
 -int sysctl_protected_symlinks __read_mostly = 0;
 -int sysctl_protected_hardlinks __read_mostly = 0;
 -int sysctl_protected_fifos __read_mostly;
 -int sysctl_protected_regular __read_mostly;
 +static int sysctl_protected_symlinks __read_mostly;
 +static int sysctl_protected_hardlinks __read_mostly;
 +static int sysctl_protected_fifos __read_mostly;
 +static int sysctl_protected_regular __read_mostly;
 +
 +#ifdef CONFIG_SYSCTL
 +static struct ctl_table namei_sysctls[] = {
 +      {
 +              .procname       = "protected_symlinks",
 +              .data           = &sysctl_protected_symlinks,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_ONE,
 +      },
 +      {
 +              .procname       = "protected_hardlinks",
 +              .data           = &sysctl_protected_hardlinks,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_ONE,
 +      },
 +      {
 +              .procname       = "protected_fifos",
 +              .data           = &sysctl_protected_fifos,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_TWO,
 +      },
 +      {
 +              .procname       = "protected_regular",
 +              .data           = &sysctl_protected_regular,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_TWO,
 +      },
 +      { }
 +};
 +
 +static int __init init_fs_namei_sysctls(void)
 +{
 +      register_sysctl_init("fs", namei_sysctls);
 +      return 0;
 +}
 +fs_initcall(init_fs_namei_sysctls);
 +
 +#endif /* CONFIG_SYSCTL */
  
  /**
   * may_follow_link - Check symlink following for unsafe situations
@@@ -4024,13 -3974,12 +4024,12 @@@ int vfs_rmdir(struct user_namespace *mn
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);
-       fsnotify_rmdir(dir, dentry);
  
  out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        if (!error)
-               d_delete(dentry);
+               d_delete_notify(dir, dentry);
        return error;
  }
  EXPORT_SYMBOL(vfs_rmdir);
@@@ -4152,7 -4101,6 +4151,6 @@@ int vfs_unlink(struct user_namespace *m
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
-                               fsnotify_unlink(dir, dentry);
                        }
                }
        }
        inode_unlock(target);
  
        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
-       if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+       if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+               fsnotify_unlink(dir, dentry);
+       } else if (!error) {
                fsnotify_link_count(target);
-               d_delete(dentry);
+               d_delete_notify(dir, dentry);
        }
  
        return error;
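Both vfs_rmdir() and vfs_unlink() above replace the old
fsnotify_rmdir()/fsnotify_unlink() + d_delete() pairs with
d_delete_notify(), so the dcache entry is invalidated before the delete
event is generated. Roughly the shape of that helper (paraphrased from
the fsnotify change; include/linux/fsnotify.h has the authoritative
definition):

static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	/* Pin the inode so it outlives d_delete() and can tag the event. */
	ihold(inode);
	d_delete(dentry);			/* drop the dentry first... */
	fsnotify_delete(dir, inode, dentry);	/* ...then emit FS_DELETE */
	iput(inode);
}
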
diff --combined fs/notify/fanotify/fanotify_user.c
@@@ -59,7 -59,7 +59,7 @@@ static int fanotify_max_queued_events _
  static long ft_zero = 0;
  static long ft_int_max = INT_MAX;
  
 -struct ctl_table fanotify_table[] = {
 +static struct ctl_table fanotify_table[] = {
        {
                .procname       = "max_user_groups",
                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
        },
        { }
  };
 +
 +static void __init fanotify_sysctls_init(void)
 +{
 +      register_sysctl("fs/fanotify", fanotify_table);
 +}
 +#else
 +#define fanotify_sysctls_init() do { } while (0)
  #endif /* CONFIG_SYSCTL */
  
  /*
@@@ -158,7 -151,6 +158,6 @@@ static size_t fanotify_event_len(unsign
                                 struct fanotify_event *event)
  {
        size_t event_len = FAN_EVENT_METADATA_LEN;
-       struct fanotify_info *info;
        int fh_len;
        int dot_len = 0;
  
        if (fanotify_is_error_event(event->mask))
                event_len += FANOTIFY_ERROR_INFO_LEN;
  
-       info = fanotify_event_info(event);
        if (fanotify_event_has_any_dir_fh(event)) {
                event_len += fanotify_dir_name_info_len(event);
        } else if ((info_mode & FAN_REPORT_NAME) &&
@@@ -1750,7 -1740,6 +1747,7 @@@ static int __init fanotify_user_setup(v
        init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
                                        FANOTIFY_DEFAULT_MAX_GROUPS;
        init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
 +      fanotify_sysctls_init();
  
        return 0;
  }
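
The namei and fanotify hunks follow the same sysctl pattern: the
ctl_table becomes static to the file that owns it and is registered by
that subsystem, instead of living in the big table in kernel/sysctl.c.
A minimal sketch of the pattern (the "fs/example" path and example_val
knob are invented for illustration):

#include <linux/init.h>
#include <linux/sysctl.h>

static int example_val;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_val",
		.data		= &example_val,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }	/* sentinel */
};

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/fs/example/example_val */
	register_sysctl("fs/example", example_table);
	return 0;
}
fs_initcall(example_sysctl_init);

register_sysctl_init(), used by fs/namei.c, is the variant for
boot-time tables that are never unregistered; register_sysctl() returns
a header that a module could later pass to unregister_sysctl_table().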