Merge tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 29 Apr 2021 18:06:13 +0000 (11:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 29 Apr 2021 18:06:13 +0000 (11:06 -0700)
Pull fsnotify updates from Jan Kara:

 - support for limited fanotify functionality for unprivileged users

 - faster merging of fanotify events

 - a few smaller fsnotify improvements

* tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  shmem: allow reporting fanotify events with file handles on tmpfs
  fs: introduce a wrapper uuid_to_fsid()
  fanotify_user: use upper_32_bits() to verify mask
  fanotify: support limited functionality for unprivileged users
  fanotify: configurable limits via sysfs
  fanotify: limit number of event merge attempts
  fsnotify: use hash table for faster events merge
  fanotify: mix event info and pid into merge key hash
  fanotify: reduce event objectid to 29-bit hash
  fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue

1  2 
fs/ext2/super.c
fs/ext4/super.c
fs/zonefs/super.c
include/linux/user_namespace.h
kernel/sysctl.c

diff --combined fs/ext2/super.c
@@@ -1104,7 -1104,7 +1104,7 @@@ static int ext2_fill_super(struct super
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
  
 -      /* per fileystem reservation list head & lock */
 +      /* per filesystem reservation list head & lock */
        spin_lock_init(&sbi->s_rsv_window_lock);
        sbi->s_rsv_window_root = RB_ROOT;
        /*
@@@ -1399,7 -1399,6 +1399,6 @@@ static int ext2_statfs (struct dentry 
        struct super_block *sb = dentry->d_sb;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
        struct ext2_super_block *es = sbi->s_es;
-       u64 fsid;
  
        spin_lock(&sbi->s_lock);
  
        buf->f_ffree = ext2_count_free_inodes(sb);
        es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
        buf->f_namelen = EXT2_NAME_LEN;
-       fsid = le64_to_cpup((void *)es->s_uuid) ^
-              le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-       buf->f_fsid = u64_to_fsid(fsid);
+       buf->f_fsid = uuid_to_fsid(es->s_uuid);
        spin_unlock(&sbi->s_lock);
        return 0;
  }
diff --combined fs/ext4/super.c
@@@ -1210,7 -1210,6 +1210,7 @@@ static void ext4_put_super(struct super
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 +      percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@@ -5013,9 -5012,6 +5013,9 @@@ no_journal
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
        if (!err)
 +              err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
 +                                        GFP_KERNEL);
 +      if (!err)
                err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
  
        if (err) {
@@@ -5128,7 -5124,6 +5128,7 @@@ failed_mount6
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 +      percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
  failed_mount5:
        ext4_ext_release(sb);
@@@ -5154,8 -5149,8 +5154,8 @@@ failed_mount_wq
  failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
  failed_mount3:
 -      del_timer_sync(&sbi->s_err_report);
        flush_work(&sbi->s_error_work);
 +      del_timer_sync(&sbi->s_err_report);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
  failed_mount2:
@@@ -6153,7 -6148,6 +6153,6 @@@ static int ext4_statfs(struct dentry *d
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t overhead = 0, resv_blocks;
-       u64 fsid;
        s64 bfree;
        resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
  
        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
        buf->f_namelen = EXT4_NAME_LEN;
-       fsid = le64_to_cpup((void *)es->s_uuid) ^
-              le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-       buf->f_fsid = u64_to_fsid(fsid);
+       buf->f_fsid = uuid_to_fsid(es->s_uuid);
  
  #ifdef CONFIG_QUOTA
        if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
diff --combined fs/zonefs/super.c
@@@ -165,21 -165,6 +165,21 @@@ static int zonefs_writepages(struct add
        return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
  }
  
 +static int zonefs_swap_activate(struct swap_info_struct *sis,
 +                              struct file *swap_file, sector_t *span)
 +{
 +      struct inode *inode = file_inode(swap_file);
 +      struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +
 +      if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
 +              zonefs_err(inode->i_sb,
 +                         "swap file: not a conventional zone file\n");
 +              return -EINVAL;
 +      }
 +
 +      return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
 +}
 +
  static const struct address_space_operations zonefs_file_aops = {
        .readpage               = zonefs_readpage,
        .readahead              = zonefs_readahead,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
        .direct_IO              = noop_direct_IO,
 +      .swap_activate          = zonefs_swap_activate,
  };
  
  static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
@@@ -744,68 -728,6 +744,68 @@@ out_release
  }
  
  /*
 + * Do not exceed the LFS limits nor the file zone size. If pos is under the
 + * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 + */
 +static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 +                                      loff_t count)
 +{
 +      struct inode *inode = file_inode(file);
 +      struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +      loff_t limit = rlimit(RLIMIT_FSIZE);
 +      loff_t max_size = zi->i_max_size;
 +
 +      if (limit != RLIM_INFINITY) {
 +              if (pos >= limit) {
 +                      send_sig(SIGXFSZ, current, 0);
 +                      return -EFBIG;
 +              }
 +              count = min(count, limit - pos);
 +      }
 +
 +      if (!(file->f_flags & O_LARGEFILE))
 +              max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 +
 +      if (unlikely(pos >= max_size))
 +              return -EFBIG;
 +
 +      return min(count, max_size - pos);
 +}
 +
 +static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 +{
 +      struct file *file = iocb->ki_filp;
 +      struct inode *inode = file_inode(file);
 +      struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +      loff_t count;
 +
 +      if (IS_SWAPFILE(inode))
 +              return -ETXTBSY;
 +
 +      if (!iov_iter_count(from))
 +              return 0;
 +
 +      if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 +              return -EINVAL;
 +
 +      if (iocb->ki_flags & IOCB_APPEND) {
 +              if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
 +                      return -EINVAL;
 +              mutex_lock(&zi->i_truncate_mutex);
 +              iocb->ki_pos = zi->i_wpoffset;
 +              mutex_unlock(&zi->i_truncate_mutex);
 +      }
 +
 +      count = zonefs_write_check_limits(file, iocb->ki_pos,
 +                                        iov_iter_count(from));
 +      if (count < 0)
 +              return count;
 +
 +      iov_iter_truncate(from, count);
 +      return iov_iter_count(from);
 +}
 +
 +/*
   * Handle direct writes. For sequential zone files, this is the only possible
   * write path. For these files, check that the user is issuing writes
   * sequentially from the end of the file. This code assumes that the block layer
@@@ -822,7 -744,8 +822,7 @@@ static ssize_t zonefs_file_dio_write(st
        struct super_block *sb = inode->i_sb;
        bool sync = is_sync_kiocb(iocb);
        bool append = false;
 -      size_t count;
 -      ssize_t ret;
 +      ssize_t ret, count;
  
        /*
         * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
                inode_lock(inode);
        }
  
 -      ret = generic_write_checks(iocb, from);
 -      if (ret <= 0)
 +      count = zonefs_write_checks(iocb, from);
 +      if (count <= 0) {
 +              ret = count;
                goto inode_unlock;
 -
 -      iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
 -      count = iov_iter_count(from);
 +      }
  
        if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
                ret = -EINVAL;
@@@ -904,10 -828,12 +904,10 @@@ static ssize_t zonefs_file_buffered_wri
                inode_lock(inode);
        }
  
 -      ret = generic_write_checks(iocb, from);
 +      ret = zonefs_write_checks(iocb, from);
        if (ret <= 0)
                goto inode_unlock;
  
 -      iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
 -
        ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
        if (ret > 0)
                iocb->ki_pos += ret;
@@@ -1040,7 -966,9 +1040,7 @@@ static int zonefs_open_zone(struct inod
  
        mutex_lock(&zi->i_truncate_mutex);
  
 -      zi->i_wr_refcnt++;
 -      if (zi->i_wr_refcnt == 1) {
 -
 +      if (!zi->i_wr_refcnt) {
                if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
                        atomic_dec(&sbi->s_open_zones);
                        ret = -EBUSY;
                if (i_size_read(inode) < zi->i_max_size) {
                        ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
                        if (ret) {
 -                              zi->i_wr_refcnt--;
                                atomic_dec(&sbi->s_open_zones);
                                goto unlock;
                        }
                }
        }
  
 +      zi->i_wr_refcnt++;
 +
  unlock:
        mutex_unlock(&zi->i_truncate_mutex);
  
@@@ -1177,7 -1104,6 +1177,6 @@@ static int zonefs_statfs(struct dentry 
        struct super_block *sb = dentry->d_sb;
        struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
        enum zonefs_ztype t;
-       u64 fsid;
  
        buf->f_type = ZONEFS_MAGIC;
        buf->f_bsize = sb->s_blocksize;
  
        spin_unlock(&sbi->s_lock);
  
-       fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^
-               le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64));
-       buf->f_fsid = u64_to_fsid(fsid);
+       buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
  
        return 0;
  }
@@@ -50,6 -50,10 +50,10 @@@ enum ucount_type 
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
  #endif
+ #ifdef CONFIG_FANOTIFY
+       UCOUNT_FANOTIFY_GROUPS,
+       UCOUNT_FANOTIFY_MARKS,
+ #endif
        UCOUNT_COUNTS,
  };
  
@@@ -63,9 -67,6 +67,9 @@@ struct user_namespace 
        kgid_t                  group;
        struct ns_common        ns;
        unsigned long           flags;
 +      /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
 +       * in its effective capability set at the child ns creation time. */
 +      bool                    parent_could_setfcap;
  
  #ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
diff --combined kernel/sysctl.c
@@@ -148,6 -148,9 +148,9 @@@ static unsigned long hung_task_timeout_
  #ifdef CONFIG_INOTIFY_USER
  #include <linux/inotify.h>
  #endif
+ #ifdef CONFIG_FANOTIFY
+ #include <linux/fanotify.h>
+ #endif
  
  #ifdef CONFIG_PROC_SYSCTL
  
@@@ -184,6 -187,17 +187,6 @@@ static enum sysctl_writes_mode sysctl_w
  int sysctl_legacy_va_layout;
  #endif
  
 -#ifdef CONFIG_SCHED_DEBUG
 -static int min_sched_granularity_ns = 100000;         /* 100 usecs */
 -static int max_sched_granularity_ns = NSEC_PER_SEC;   /* 1 second */
 -static int min_wakeup_granularity_ns;                 /* 0 usecs */
 -static int max_wakeup_granularity_ns = NSEC_PER_SEC;  /* 1 second */
 -#ifdef CONFIG_SMP
 -static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 -static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 -#endif /* CONFIG_SMP */
 -#endif /* CONFIG_SCHED_DEBUG */
 -
  #ifdef CONFIG_COMPACTION
  static int min_extfrag_threshold;
  static int max_extfrag_threshold = 1000;
@@@ -1648,6 -1662,58 +1651,6 @@@ static struct ctl_table kern_table[] = 
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
 -#ifdef CONFIG_SCHED_DEBUG
 -      {
 -              .procname       = "sched_min_granularity_ns",
 -              .data           = &sysctl_sched_min_granularity,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = sched_proc_update_handler,
 -              .extra1         = &min_sched_granularity_ns,
 -              .extra2         = &max_sched_granularity_ns,
 -      },
 -      {
 -              .procname       = "sched_latency_ns",
 -              .data           = &sysctl_sched_latency,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = sched_proc_update_handler,
 -              .extra1         = &min_sched_granularity_ns,
 -              .extra2         = &max_sched_granularity_ns,
 -      },
 -      {
 -              .procname       = "sched_wakeup_granularity_ns",
 -              .data           = &sysctl_sched_wakeup_granularity,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = sched_proc_update_handler,
 -              .extra1         = &min_wakeup_granularity_ns,
 -              .extra2         = &max_wakeup_granularity_ns,
 -      },
 -#ifdef CONFIG_SMP
 -      {
 -              .procname       = "sched_tunable_scaling",
 -              .data           = &sysctl_sched_tunable_scaling,
 -              .maxlen         = sizeof(enum sched_tunable_scaling),
 -              .mode           = 0644,
 -              .proc_handler   = sched_proc_update_handler,
 -              .extra1         = &min_sched_tunable_scaling,
 -              .extra2         = &max_sched_tunable_scaling,
 -      },
 -      {
 -              .procname       = "sched_migration_cost_ns",
 -              .data           = &sysctl_sched_migration_cost,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 -      },
 -      {
 -              .procname       = "sched_nr_migrate",
 -              .data           = &sysctl_sched_nr_migrate,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 -      },
  #ifdef CONFIG_SCHEDSTATS
        {
                .procname       = "sched_schedstats",
                .extra2         = SYSCTL_ONE,
        },
  #endif /* CONFIG_SCHEDSTATS */
 -#endif /* CONFIG_SMP */
  #ifdef CONFIG_NUMA_BALANCING
        {
 -              .procname       = "numa_balancing_scan_delay_ms",
 -              .data           = &sysctl_numa_balancing_scan_delay,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 -      },
 -      {
 -              .procname       = "numa_balancing_scan_period_min_ms",
 -              .data           = &sysctl_numa_balancing_scan_period_min,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 -      },
 -      {
 -              .procname       = "numa_balancing_scan_period_max_ms",
 -              .data           = &sysctl_numa_balancing_scan_period_max,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 -      },
 -      {
 -              .procname       = "numa_balancing_scan_size_mb",
 -              .data           = &sysctl_numa_balancing_scan_size,
 -              .maxlen         = sizeof(unsigned int),
 -              .mode           = 0644,
 -              .proc_handler   = proc_dointvec_minmax,
 -              .extra1         = SYSCTL_ONE,
 -      },
 -      {
                .procname       = "numa_balancing",
                .data           = NULL, /* filled in by handler */
                .maxlen         = sizeof(unsigned int),
                .extra2         = SYSCTL_ONE,
        },
  #endif /* CONFIG_NUMA_BALANCING */
 -#endif /* CONFIG_SCHED_DEBUG */
        {
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
@@@ -3164,7 -3261,14 +3167,14 @@@ static struct ctl_table fs_table[] = 
                .mode           = 0555,
                .child          = inotify_table,
        },
- #endif        
+ #endif
+ #ifdef CONFIG_FANOTIFY
+       {
+               .procname       = "fanotify",
+               .mode           = 0555,
+               .child          = fanotify_table,
+       },
+ #endif
  #ifdef CONFIG_EPOLL
        {
                .procname       = "epoll",