**mandatory**
->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+ The xattr_handler.set() gets passed the user namespace of the mount the inode
+ is seen from so filesystems can idmap the i_uid and i_gid accordingly.
dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
in the instances. Rationale: !@#!@# security_d_instantiate() needs to be
called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
clone_private_mount() returns a longterm mount now, so the proper destructor of
its result is kern_unmount() or kern_unmount_array().
+
+---
+
+**mandatory**
+
+zero-length bvec segments are disallowed, they must be filtered out before
+passed on to an iterator.
+
+---
+
+**mandatory**
+
+For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but
+uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
+page references stay until I/O has completed, i.e. until ->ki_complete() has
+been called or returned with non -EIOCBQUEUED code.
.. code-block:: c
- struct file_system_operations {
+ struct file_system_type {
const char *name;
int fs_flags;
struct dentry *(*mount) (struct file_system_type *, int,
->alloc_inode.
``dirty_inode``
- this method is called by the VFS to mark an inode dirty.
+ this method is called by the VFS when an inode is marked dirty.
+ This is specifically for the inode itself being marked dirty,
+ not its data. If the update needs to be persisted by fdatasync(),
+ then I_DIRTY_DATASYNC will be set in the flags argument.
``write_inode``
this method is called when the VFS needs to write an inode to
.. code-block:: c
struct inode_operations {
- int (*create) (struct inode *,struct dentry *, umode_t, bool);
+ int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool);
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
- int (*symlink) (struct inode *,struct dentry *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,umode_t);
+ int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *);
+ int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
- int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
- int (*rename) (struct inode *, struct dentry *,
+ int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t);
+ int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
const char *(*get_link) (struct dentry *, struct inode *,
struct delayed_call *);
- int (*permission) (struct inode *, int);
+ int (*permission) (struct user_namespace *, struct inode *, int);
int (*get_acl)(struct inode *, int);
- int (*setattr) (struct dentry *, struct iattr *);
- int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+ int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *);
+ int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode);
- int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t);
+ int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int);
};
Again, all methods are called without any locks being held, unless
#
0 nospu restart_syscall sys_restart_syscall
1 nospu exit sys_exit
-2 32 fork ppc_fork sys_fork
-2 64 fork sys_fork
-2 spu fork sys_ni_syscall
+2 nospu fork sys_fork
3 common read sys_read
4 common write sys_write
5 common open sys_open compat_sys_open
119 32 sigreturn sys_sigreturn compat_sys_sigreturn
119 64 sigreturn sys_ni_syscall
119 spu sigreturn sys_ni_syscall
-120 32 clone ppc_clone sys_clone
-120 64 clone sys_clone
-120 spu clone sys_ni_syscall
+120 nospu clone sys_clone
121 common setdomainname sys_setdomainname
122 common uname sys_newuname
123 common modify_ldt sys_ni_syscall
186 spu sendfile sys_sendfile64
187 common getpmsg sys_ni_syscall
188 common putpmsg sys_ni_syscall
-189 32 vfork ppc_vfork sys_vfork
-189 64 vfork sys_vfork
-189 spu vfork sys_ni_syscall
+189 nospu vfork sys_vfork
190 common ugetrlimit sys_getrlimit compat_sys_getrlimit
191 common readahead sys_readahead compat_sys_readahead
192 32 mmap2 sys_mmap2 compat_sys_mmap2
248 32 clock_nanosleep sys_clock_nanosleep_time32
248 64 clock_nanosleep sys_clock_nanosleep
248 spu clock_nanosleep sys_clock_nanosleep
-249 32 swapcontext ppc_swapcontext compat_sys_swapcontext
-249 64 swapcontext sys_swapcontext
-249 spu swapcontext sys_ni_syscall
+249 nospu swapcontext sys_swapcontext compat_sys_swapcontext
250 common tgkill sys_tgkill
251 32 utimes sys_utimes_time32
251 64 utimes sys_utimes
432 common fsmount sys_fsmount
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
-435 32 clone3 ppc_clone3 sys_clone3
-435 64 clone3 sys_clone3
-435 spu clone3 sys_ni_syscall
+435 nospu clone3 sys_clone3
436 common close_range sys_close_range
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+ 442 common mount_setattr sys_mount_setattr
}
int
- affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+ affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
}
int
- affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int error;
}
int
- affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+ affs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
return -EIO;
bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
- if (!bh_new)
+ if (!bh_new) {
+ affs_brelse(bh_old);
return -EIO;
+ }
/* Remove old header from its parent directory. */
affs_lock_dir(old_dir);
return retval;
}
- int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34)
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/* Indicate that we need to cleanup space cache v1 */
BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
};
/*
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
- struct percpu_counter dio_bytes;
+ struct percpu_counter ordered_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
/* Max size to emit ZONE_APPEND write command */
u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
u32 type;
- u64 highest_objectid;
+ u64 free_objectid;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
ALLOC_CHUNK_FORCE = 8,
RUN_DELAYED_IPUTS = 9,
COMMIT_TRANS = 10,
+ FORCE_COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid);
+ struct btrfs_root *parent_root);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags);
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ unsigned int size);
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
- int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir);
#else
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
struct btrfs_iget_args {
u64 ino;
NULL,
clear_flags,
PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
ins.objectid,
async_extent->ram_size,
ins.offset,
- BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK);
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size, 0);
+ ram_size, cur_alloc_size,
+ BTRFS_ORDERED_REGULAR);
if (ret)
goto out_drop_extent_cache;
out_unlock:
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* If we reserved an extent for our delalloc range (or a subrange) and
* failed to create the respective ordered extent, then it means that
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
- unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
return 0;
}
+static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ int ret;
+
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0);
+ if (ret)
+ return ret;
+
+ if (*page_started)
+ return 0;
+
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ *page_started = 1;
+
+ return 0;
+}
+
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
return -ENOMEM;
}
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
btrfs_free_path(path);
return ret;
{
int ret;
int force_cow = need_force_cow(inode, start, end);
+ const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
} else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ struct extent_map *em;
u64 length = 0;
u64 map_length;
- int ret;
+ int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
- &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
+ map_length, &geom);
if (ret < 0)
- return ret;
+ goto out;
if (geom.len < length + size)
- return 1;
- return 0;
+ ret = 1;
+out:
+ free_extent_map(em);
+ return ret;
}
/*
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ unsigned int size)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_extent *ordered;
+ u64 len = bio->bi_iter.bi_size + size;
+ bool ret = true;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+ ASSERT(fs_info->max_zone_append_size > 0);
+ ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
+
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+ if (!ordered)
+ return ret;
+
+ if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
+ ordered->disk_bytenr + ordered->disk_num_bytes)
+ ret = false;
+
+ btrfs_put_ordered_extent(ordered);
+
+ return ret;
+}
+
+static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em = NULL, *em_new = NULL;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+ u64 pre, post;
+ int ret = 0;
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
+
+ /* No need to split */
+ if (ordered->disk_num_bytes == len)
+ goto out;
+
+ /* We cannot split once end_bio'd ordered extent */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* We cannot split a compressed ordered extent */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+ /* bio must be in one ordered extent */
+ if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Checksum list should be empty */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ ret = -EIO;
+ goto out;
+ }
+ read_unlock(&em_tree->lock);
+
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ /*
+ * We cannot reuse em_new here but have to create a new one, as
+ * unpin_extent_cache() expects the start of the extent map to be the
+ * logical offset of the file, which does not hold true anymore after
+ * splitting.
+ */
+ em_new = create_io_em(inode, em->start + pre, len,
+ em->start + pre, em->block_start + pre, len,
+ len, len, BTRFS_COMPRESS_NONE,
+ BTRFS_ORDERED_REGULAR);
+ if (IS_ERR(em_new)) {
+ ret = PTR_ERR(em_new);
+ goto out;
+ }
+ free_extent_map(em_new);
+
+out:
+ free_extent_map(em);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct page *page = bio_first_bvec_all(bio)->bv_page;
+ loff_t file_offset = page_offset(page);
+
+ ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ if (ret)
+ goto out;
+ }
+
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
goto out;
}
+ if (ordered_extent->disk)
+ btrfs_rewrite_logical_zoned(ordered_extent);
+
btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
}
/**
- * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- * @fs_info - the fs_info for this fs
- * @return - EINTR if we were killed, 0 if nothing's pending
+ * Wait for flushing all delayed iputs
+ *
+ * @fs_info: the filesystem
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
+ *
+ * Return EINTR if we were killed, 0 if nothing's pending
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
ret = -ENOMEM;
goto out;
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
wait_on_page_writeback(page);
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
- set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode,
+ ALIGN(newsize, fs_info->sectorsize),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* We're truncating a file that used to have good data down to
return ret;
}
- static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err)
return err;
}
if (attr->ia_valid) {
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(inode, inode->i_mode);
+ err = posix_acl_chmod(&init_user_ns, inode,
+ inode->i_mode);
}
return err;
if (ret != 0)
goto fail_unlock;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
return err;
}
- static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
return err;
}
- static int btrfs_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, bool excl)
+ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
return err;
}
- static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode = NULL;
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_fail;
* @strict: if true, omit optimizations that might force us into unnecessary
* cow. e.g., don't trust generation number.
*
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks for (nowait == false) case.
- *
* Return:
* >0 and update @len if we can do nocow write
* 0 if we can't do nocow write
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ iomap->flags |= IOMAP_F_ZONE_APPEND;
+
free_extent_map(em);
return 0;
if (!refcount_dec_and_test(&dip->refs))
return;
- if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->logical_offset,
dip->bytes,
NULL);
btrfs_queue_work(wq, &ordered->work);
}
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered
- * extent in the range, we can exit.
- */
+
+ /* No ordered extent found in the range, exit */
if (ordered_offset == last_offset)
return;
/*
if (err)
dip->dio_bio->bi_status = err;
+ btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+
bio_put(bio);
btrfs_dio_private_put(dip);
}
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = bio_op(bio) == REQ_OP_WRITE;
+ bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
/* Check btrfs_submit_bio_hook() for rules about async submit. */
struct inode *inode,
loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
size_t dip_size;
struct btrfs_dio_private *dip;
static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
struct bio *dio_bio, loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
u64 submit_len;
int clone_offset = 0;
int clone_len;
+ u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
struct btrfs_dio_data *dio_data = iomap->private;
+ struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
submit_len = dio_bio->bi_iter.bi_size;
do {
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
- start_sector << 9, submit_len,
- &geom);
+ logical = start_sector << 9;
+ em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ if (IS_ERR(em)) {
+ status = errno_to_blk_status(PTR_ERR(em));
+ em = NULL;
+ goto out_err_em;
+ }
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ logical, submit_len, &geom);
if (ret) {
status = errno_to_blk_status(ret);
- goto out_err;
+ goto out_err_em;
}
ASSERT(geom.len <= INT_MAX);
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
+ WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
+ fs_info->max_zone_append_size &&
+ bio_op(bio) != REQ_OP_ZONE_APPEND);
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ status = extract_ordered_extent(BTRFS_I(inode), bio,
+ file_offset);
+ if (status) {
+ bio_put(bio);
+ goto out_err;
+ }
+ }
+
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
bio_put(bio);
if (submit_len > 0)
refcount_dec(&dip->refs);
- goto out_err;
+ goto out_err_em;
}
dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
+
+ free_extent_map(em);
} while (submit_len > 0);
return BLK_QC_T_NONE;
+out_err_em:
+ free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+
return BLK_QC_T_NONE;
}
{
int ret = try_release_extent_mapping(page, gfp_flags);
if (ret == 1)
- detach_page_private(page);
+ clear_page_extent_mapped(page);
return ret;
}
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
-again:
+
start = page_start;
+again:
ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
found_ordered = true;
}
ClearPageChecked(page);
- detach_page_private(page);
+ clear_page_extent_mapped(page);
}
/*
wait_on_page_writeback(page);
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
- set_page_extent_mapped(page);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
/*
* we can't set the delalloc bits if there are pending ordered
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid)
+ struct btrfs_root *parent_root)
{
struct inode *inode;
int err;
u64 index = 0;
+ u64 ino;
+
+ err = btrfs_get_free_objectid(new_root, &ino);
+ if (err < 0)
+ return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
- new_dirid, new_dirid,
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
return -ENOMEM;
}
- static int btrfs_getattr(const struct path *path, struct kstat *stat,
+ static int btrfs_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
u64 delalloc_bytes;
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
u64 objectid;
u64 index;
- ret = btrfs_find_free_objectid(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
return ret;
return ret;
}
- static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
return start_delalloc_inodes(root, &wbc, true, false);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
struct writeback_control wbc = {
- .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+ .nr_to_write = nr,
.sync_mode = WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice) && nr) {
+ while (!list_empty(&splice)) {
/*
* Reset nr_to_write here so we know that we're doing a full
* flush.
*/
- if (nr == U64_MAX)
+ if (nr == LONG_MAX)
wbc.nr_to_write = LONG_MAX;
root = list_first_entry(&splice, struct btrfs_root,
return ret;
}
- static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
+ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
return __set_page_dirty_nobuffers(page);
}
- static int btrfs_permission(struct inode *inode, int mask)
+ static int btrfs_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
umode_t mode = inode->i_mode;
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(inode, mask);
+ return generic_permission(&init_user_ns, inode, mask);
}
- static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_objectid(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
const char *comp = NULL;
u32 binode_flags;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EPERM;
if (btrfs_root_readonly(root))
unsigned old_i_flags;
int ret = 0;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EPERM;
if (btrfs_root_readonly(root))
return -EPERM;
/*
+ * btrfs_trim_block_group() depends on space cache, which is not
+ * available in zoned filesystem. So, disallow fitrim on a zoned
+ * filesystem for now.
+ */
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
* If the fs is mounted with nologreplay, which requires it to be
* mounted in RO mode as well, we can not allow discard on free space
* inside block groups, because log trees refer to extents that are not
int err;
dev_t anon_dev = 0;
u64 objectid;
- u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
return -ENOMEM;
- ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
+ ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
key.offset = 0;
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
goto fail;
}
- mutex_lock(&new_root->objectid_mutex);
- new_root->highest_objectid = new_dirid;
- mutex_unlock(&new_root->objectid_mutex);
-
/*
* insert the directory item
*/
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
- IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+ if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+ IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+ IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
}
/*
if (!page)
break;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
while (1) {
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
ClearPageChecked(pages[i]);
- set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
- } else if (!inode_owner_or_capable(src_inode)) {
+ } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
* are limited to own subvolumes only
u64 flags;
int ret = 0;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EPERM;
ret = mnt_want_write_file(file);
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+ ret = inode_permission(&init_user_ns, temp_inode,
+ MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
ret = -EACCES;
if (root == dest)
goto out_dput;
- err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+ err = inode_permission(&init_user_ns, inode,
+ MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
}
* running and allows defrag on files open in read-only mode.
*/
if (!capable(CAP_SYS_ADMIN) &&
- inode_permission(inode, MAY_WRITE)) {
+ inode_permission(&init_user_ns, inode, MAY_WRITE)) {
ret = -EPERM;
goto out;
}
int ret = 0;
int received_uuid_changed;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EPERM;
ret = mnt_want_write_file(file);
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
}
}
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
- } else {
- dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
+void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+ set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
+ if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
} else {
- dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+ dout("queue_inode_work %p already queued, mask=%lx\n",
inode, ci->i_work_mask);
iput(inode);
}
if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
__ceph_do_pending_vmtruncate(inode);
+ if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ ceph_check_caps(ci, 0, NULL);
+
+ if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ ceph_flush_snaps(ci, NULL);
+
iput(inode);
}
/*
* setattr
*/
- int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err != 0)
return err;
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
return err;
}
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
- int ceph_permission(struct inode *inode, int mask)
+ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
int err;
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns, inode, mask);
return err;
}
* Get all the attributes. If we have sufficient caps for the requested attrs,
* then we can avoid talking to the MDS at all.
*/
- int ceph_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
return err;
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
/*
* Masks of ceph inode work.
*/
-#define CEPH_I_WORK_WRITEBACK 0 /* writeback */
-#define CEPH_I_WORK_INVALIDATE_PAGES 1 /* invalidate pages */
-#define CEPH_I_WORK_VMTRUNCATE 2 /* vmtruncate */
+#define CEPH_I_WORK_WRITEBACK 0
+#define CEPH_I_WORK_INVALIDATE_PAGES 1
+#define CEPH_I_WORK_VMTRUNCATE 2
+#define CEPH_I_WORK_CHECK_CAPS 3
+#define CEPH_I_WORK_FLUSH_SNAPS 4
/*
* We set the ERROR_WRITE bit when we start seeing write errors on an inode
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_queue_vmtruncate(struct inode *inode);
-extern void ceph_queue_invalidate(struct inode *inode);
-extern void ceph_queue_writeback(struct inode *inode);
+
extern void ceph_async_iput(struct inode *inode);
+void ceph_queue_inode_work(struct inode *inode, int work_bit);
+
+static inline void ceph_queue_vmtruncate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
+}
+
+static inline void ceph_queue_invalidate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
+}
+
+static inline void ceph_queue_writeback(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
+}
+
+static inline void ceph_queue_check_caps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS);
+}
+
+static inline void ceph_queue_flush_snaps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
+}
+
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
{
return __ceph_do_getattr(inode, NULL, mask, force);
}
- extern int ceph_permission(struct inode *inode, int mask);
+ extern int ceph_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask);
extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
- extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
- extern int ceph_getattr(const struct path *path, struct kstat *stat,
+ extern int ceph_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr);
+ extern int ceph_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
/* xattr.c */
#ifdef CONFIG_CEPH_FS_POSIX_ACL
struct posix_acl *ceph_get_acl(struct inode *, int);
- int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+ int ceph_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acl_sec_ctx *as_ctx);
void ceph_init_inode_acls(struct inode *inode,
bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
return -EOPNOTSUPP;
}
- static int cifs_permission(struct inode *inode, int mask)
+ static int cifs_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
struct cifs_sb_info *cifs_sb;
on the client (above and beyond ACL on servers) for
servers which do not support setting and viewing mode bits,
so allowing client to check permissions is useful */
- return generic_permission(inode, mask);
+ return generic_permission(&init_user_ns, inode, mask);
}
static struct kmem_cache *cifs_inode_cachep;
static int cifs_show_devname(struct seq_file *m, struct dentry *root)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
- char *devname = kstrdup(cifs_sb->ctx->UNC, GFP_KERNEL);
+ char *devname = kstrdup(cifs_sb->ctx->source, GFP_KERNEL);
if (devname == NULL)
seq_puts(m, "none");
goto out;
}
- rc = cifs_setup_volume_info(cifs_sb->ctx);
+ rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
if (rc) {
root = ERR_PTR(rc);
goto out;
return rc;
}
- int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
- bool excl)
+ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
+ struct dentry *direntry, umode_t mode, bool excl)
{
int rc;
unsigned int xid = get_xid();
return rc;
}
- int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
- dev_t device_number)
+ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
+ struct dentry *direntry, umode_t mode, dev_t device_number)
{
int rc = -EPERM;
unsigned int xid;
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
struct inode *inode;
+ int rc;
if (flags & LOOKUP_RCU)
return -ECHILD;
if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
CIFS_I(inode)->time = 0; /* force reval */
- if (cifs_revalidate_dentry(direntry))
- return 0;
+ rc = cifs_revalidate_dentry(direntry);
+ if (rc) {
+ cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc);
+ switch (rc) {
+ case -ENOENT:
+ case -ESTALE:
+ /*
+ * Those errors mean the dentry is invalid
+ * (file was deleted or recreated)
+ */
+ return 0;
+ default:
+ /*
+ * Otherwise some unexpected error happened
+ * report it as-is to VFS layer
+ */
+ return rc;
+ }
+ }
else {
/*
* If the inode wasn't known to be a dfs entry when
else if (d_unhashed(lower_dentry))
rc = -EINVAL;
else
- rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+ rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
+ NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
+ rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+ mode, true);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
inode = __ecryptfs_get_inode(d_inode(lower_dentry),
directory_inode->i_sb);
if (IS_ERR(inode)) {
- vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+ vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
+ lower_dentry, NULL);
goto out_lock;
}
fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
* Returns zero on success; non-zero on error condition
*/
static int
- ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+ ecryptfs_create(struct user_namespace *mnt_userns,
+ struct inode *directory_inode, struct dentry *ecryptfs_dentry,
umode_t mode, bool excl)
{
struct inode *ecryptfs_inode;
dget(lower_old_dentry);
dget(lower_new_dentry);
lower_dir_dentry = lock_parent(lower_new_dentry);
- rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
- lower_new_dentry, NULL);
+ rc = vfs_link(lower_old_dentry, &init_user_ns,
+ d_inode(lower_dir_dentry), lower_new_dentry, NULL);
if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
}
- static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
const char *symname)
{
int rc;
strlen(symname));
if (rc)
goto out_lock;
- rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
+ rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
encoded_symname);
kfree(encoded_symname);
if (rc || d_really_is_negative(lower_dentry))
return rc;
}
- static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+ rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+ mode);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
else if (d_unhashed(lower_dentry))
rc = -EINVAL;
else
- rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+ rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
if (!rc) {
clear_nlink(d_inode(dentry));
fsstack_copy_attr_times(dir, lower_dir_inode);
}
static int
- ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
int rc;
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+ rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+ mode, dev);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
}
static int
- ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
int rc;
struct dentry *lower_old_dentry;
struct dentry *lower_new_dir_dentry;
struct dentry *trap;
struct inode *target_inode;
+ struct renamedata rd = {};
if (flags)
return -EINVAL;
rc = -ENOTEMPTY;
goto out_lock;
}
- rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
- d_inode(lower_new_dir_dentry), lower_new_dentry,
- NULL, 0);
+
+ rd.old_mnt_userns = &init_user_ns;
+ rd.old_dir = d_inode(lower_old_dir_dentry);
+ rd.old_dentry = lower_old_dentry;
+ rd.new_mnt_userns = &init_user_ns;
+ rd.new_dir = d_inode(lower_new_dir_dentry);
+ rd.new_dentry = lower_new_dentry;
+ rc = vfs_rename(&rd);
if (rc)
goto out_lock;
if (target_inode)
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
inode_lock(d_inode(lower_dentry));
- rc = notify_change(lower_dentry, &lower_ia, NULL);
+ rc = notify_change(&init_user_ns, lower_dentry,
+ &lower_ia, NULL);
inode_unlock(d_inode(lower_dentry));
}
return rc;
}
static int
- ecryptfs_permission(struct inode *inode, int mask)
+ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
- return inode_permission(ecryptfs_inode_to_lower(inode), mask);
+ return inode_permission(&init_user_ns,
+ ecryptfs_inode_to_lower(inode), mask);
}
/**
* All other metadata changes will be passed right to the lower filesystem,
* and we will just update our inode to look like the lower.
*/
- static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+ static int ecryptfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *ia)
{
int rc = 0;
struct dentry *lower_dentry;
}
mutex_unlock(&crypt_stat->cs_mutex);
- rc = setattr_prepare(dentry, ia);
+ rc = setattr_prepare(&init_user_ns, dentry, ia);
if (rc)
goto out;
if (ia->ia_valid & ATTR_SIZE) {
lower_ia.ia_valid &= ~ATTR_MODE;
inode_lock(d_inode(lower_dentry));
- rc = notify_change(lower_dentry, &lower_ia, NULL);
+ rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL);
inode_unlock(d_inode(lower_dentry));
out:
fsstack_copy_attr_all(inode, lower_inode);
return rc;
}
- static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
+ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- generic_fillattr(d_inode(dentry), stat);
+ generic_fillattr(&init_user_ns, d_inode(dentry), stat);
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
char *target;
size_t targetsiz;
return rc;
}
- static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
+ static int ecryptfs_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
- generic_fillattr(d_inode(dentry), stat);
+ generic_fillattr(&init_user_ns, d_inode(dentry), stat);
stat->blocks = lower_stat.blocks;
}
return rc;
{
int rc;
struct dentry *lower_dentry;
+ struct inode *lower_inode;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
+ lower_inode = d_inode(lower_dentry);
+ if (!(lower_inode->i_opflags & IOP_XATTR)) {
rc = -EOPNOTSUPP;
goto out;
}
- rc = vfs_setxattr(&init_user_ns, lower_dentry, name, value, size,
- flags);
+ inode_lock(lower_inode);
- rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, NULL);
++ rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL);
+ inode_unlock(lower_inode);
if (!rc && inode)
- fsstack_copy_attr_all(inode, d_inode(lower_dentry));
+ fsstack_copy_attr_all(inode, lower_inode);
out:
return rc;
}
goto out;
}
inode_lock(lower_inode);
- rc = __vfs_removexattr(lower_dentry, name);
+ rc = __vfs_removexattr(&init_user_ns, lower_dentry, name);
inode_unlock(lower_inode);
out:
return rc;
}
static int ecryptfs_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value, size_t size,
int flags)
return -ENOMEM;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, old_start, old_end);
+ tlb_gather_mmu(&tlb, mm);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
- tlb_finish_mmu(&tlb, old_start, old_end);
+ tlb_finish_mmu(&tlb);
/*
* Shrink the vma to just the new range. Always succeeds.
void would_dump(struct linux_binprm *bprm, struct file *file)
{
struct inode *inode = file_inode(file);
- if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ if (inode_permission(mnt_userns, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != &init_user_ns) &&
- !privileged_wrt_inode_uidgid(user_ns, inode))
+ !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode))
user_ns = user_ns->parent;
if (old != user_ns) {
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
/* Handle suid and sgid on files */
+ struct user_namespace *mnt_userns;
struct inode *inode;
unsigned int mode;
kuid_t uid;
if (!(mode & (S_ISUID|S_ISGID)))
return;
+ mnt_userns = file_mnt_user_ns(file);
+
/* Be careful if suid/sgid is set */
inode_lock(inode);
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
- uid = inode->i_uid;
- gid = inode->i_gid;
+ uid = i_uid_into_mnt(mnt_userns, inode);
+ gid = i_gid_into_mnt(mnt_userns, inode);
inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
int exfat_load_bitmap(struct super_block *sb);
void exfat_free_bitmap(struct exfat_sb_info *sbi);
int exfat_set_bitmap(struct inode *inode, unsigned int clu);
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu);
+void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
extern const struct file_operations exfat_file_operations;
int __exfat_truncate(struct inode *inode, loff_t new_size);
void exfat_truncate(struct inode *inode, loff_t size);
- int exfat_setattr(struct dentry *dentry, struct iattr *attr);
- int exfat_getattr(const struct path *path, struct kstat *stat,
- unsigned int request_mask, unsigned int query_flags);
+ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr);
+ int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, unsigned int request_mask,
+ unsigned int query_flags);
int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
/* namei.c */
mutex_unlock(&sbi->s_lock);
}
- int exfat_getattr(const struct path *path, struct kstat *stat,
- unsigned int request_mask, unsigned int query_flags)
+ int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path,
+ struct kstat *stat, unsigned int request_mask,
+ unsigned int query_flags)
{
struct inode *inode = d_backing_inode(path->dentry);
struct exfat_inode_info *ei = EXFAT_I(inode);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
exfat_truncate_atime(&stat->atime);
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
return 0;
}
- int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
struct inode *inode = dentry->d_inode;
ATTR_TIMES_SET);
}
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
attr->ia_valid = ia_valid;
if (error)
goto out;
up_write(&EXFAT_I(inode)->truncate_lock);
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
exfat_truncate_atime(&inode->i_atime);
mark_inode_dirty(inode);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
const struct file_operations exfat_file_operations = {
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
- struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
+ handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
__u32 goal, uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
- inode->i_uid = current_fsuid();
+ inode->i_uid = fsuid_into_mnt(mnt_userns);
inode->i_gid = dir->i_gid;
} else
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+ blkdev_issue_flush(sb->s_bdev);
skip_zeroout:
ext4_lock_group(sb, group);
*/
#include <linux/fs.h>
+ #include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
if (!inode)
return;
- if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) ||
- ((inode->i_state & I_DIRTY_TIME) == 0))
+ if (!inode_is_dirtytime_only(inode))
return;
spin_lock(&inode->i_lock);
- if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) == 0) &&
- (inode->i_state & I_DIRTY_TIME)) {
+ if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
inode->i_state &= ~I_DIRTY_TIME;
*
* Called with inode->i_mutex down.
*/
- int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error, rc = 0;
ATTR_GID | ATTR_TIMES_SET))))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(mnt_userns, dentry, attr);
if (error)
return error;
}
if (!error) {
- setattr_copy(inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
}
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(inode, inode->i_mode);
+ rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
err_out:
if (error)
return error;
}
- int ext4_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct ext4_inode *raw_inode;
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
- int ext4_file_getattr(const struct path *path, struct kstat *stat,
+ int ext4_file_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
u64 delalloc_blocks;
- ext4_getattr(path, stat, request_mask, query_flags);
+ ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
/*
* If there is inline data in the inode, the inode will normally not
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
- *
- * If only the I_DIRTY_TIME flag is set, we can skip everything. If
- * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
- * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- if (flags == I_DIRTY_TIME)
- return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
- goto out;
-
+ return;
ext4_mark_inode_dirty(handle, inode);
-
ext4_journal_stop(handle);
-out:
- return;
}
int ext4_change_inode_journal_flag(struct inode *inode, int val)
* important fields of the inodes.
*
* @sb: the super block of the filesystem
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
+ struct user_namespace *mnt_userns,
struct inode *inode)
{
handle_t *handle;
}
if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ !inode_owner_or_capable(mnt_userns, inode) ||
+ !capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto journal_err_out;
}
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
unsigned int flags;
ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
case FS_IOC_SETFLAGS: {
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
__u32 generation;
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
if (ext4_has_metadata_csum(inode->i_sb)) {
case EXT4_IOC_MIGRATE:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
err = mnt_want_write_file(filp);
case EXT4_IOC_ALLOC_DA_BLKS:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
err = mnt_want_write_file(filp);
err = mnt_want_write_file(filp);
if (err)
return err;
- err = swap_inode_boot_loader(sb, inode);
+ err = swap_inode_boot_loader(sb, mnt_userns, inode);
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_CLEAR_ES_CACHE:
{
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ext4_clear_inode_es(inode);
return 0;
return -EFAULT;
/* Make sure caller has proper permission */
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
return -EOPNOTSUPP;
return fsverity_ioctl_measure(filp, (void __user *)arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
default:
return -ENOTTY;
}
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(sb->s_bdev);
if (!ret)
ret = err;
}
.name = "ext4",
.mount = ext4_mount,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
return __f2fs_get_acl(inode, type, NULL);
}
- if (!in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
+{
+ umode_t mode = inode->i_mode;
+ int error;
+
+ if (is_inode_flag_set(inode, FI_ACL_MODE))
+ mode = F2FS_I(inode)->i_acl_mode;
+
+ error = posix_acl_equiv_mode(*acl, &mode);
+ if (error < 0)
+ return error;
+ if (error == 0)
+ *acl = NULL;
++ if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
++ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ mode &= ~S_ISGID;
+ *mode_p = mode;
+ return 0;
+}
+
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = posix_acl_update_mode(&init_user_ns, inode,
- &mode, &acl);
+ error = f2fs_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
return error;
}
- int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
FAULT_PAGE_GET,
- FAULT_ALLOC_BIO,
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_MOUNT_ATGC 0x08000000
+#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
/* For compression */
unsigned char compress_algorithm; /* algorithm type */
unsigned char compress_log_size; /* cluster log size */
+ unsigned char compress_level; /* compress level */
bool compress_chksum; /* compressed data chksum */
unsigned char compress_ext_cnt; /* extension count */
int compress_mode; /* compression mode */
unsigned int seq_id; /* sequence id */
};
+struct ckpt_req {
+ struct completion wait; /* completion for checkpoint done */
+ struct llist_node llnode; /* llist_node to be linked in wait queue */
+ int ret; /* return code of checkpoint */
+ ktime_t queue_time; /* request queued time */
+};
+
+struct ckpt_req_control {
+ struct task_struct *f2fs_issue_ckpt; /* checkpoint task */
+ int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */
+ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */
+ atomic_t issued_ckpt; /* # of actually issued ckpts */
+ atomic_t total_ckpt; /* # of total ckpts */
+ atomic_t queued_ckpt; /* # of queued ckpts */
+ struct llist_head issue_list; /* list for command issue */
+ spinlock_t stat_lock; /* lock for below checkpoint time stats */
+ unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */
+ unsigned int peak_time; /* peak wait time in msec until now */
+};
+
/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct task_struct *inmem_task; /* store inmemory task */
struct mutex inmem_lock; /* lock for inmemory pages */
- pgoff_t ra_offset; /* ongoing readahead offset */
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
atomic_t i_compr_blocks; /* # of compressed blocks */
unsigned char i_compress_algorithm; /* algorithm type */
unsigned char i_log_cluster_size; /* log of cluster size */
+ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
unsigned short i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
};
#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000
+#define COMPRESS_LEVEL_OFFSET 8
+
/* compress context */
struct compress_ctx {
struct inode *inode; /* inode the context belong to */
atomic_t pending_pages; /* in-flight compressed page count */
};
-/* decompress io context for read IO path */
+/* Context for decompressing one cluster on the read IO path */
struct decompress_io_ctx {
u32 magic; /* magic number to indicate page is compressed */
struct inode *inode; /* inode the context belong to */
struct compress_data *cbuf; /* virtual mapped address on cpages */
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
- atomic_t pending_pages; /* in-flight compressed page count */
- atomic_t verity_pages; /* in-flight page count for verity */
- bool failed; /* indicate IO error during decompression */
+
+ /*
+ * The number of compressed pages remaining to be read in this cluster.
+ * This is initially nr_cpages. It is decremented by 1 each time a page
+ * has been read (or failed to be read). When it reaches 0, the cluster
+ * is decompressed (or an error is reported).
+ *
+ * If an error occurs before all the pages have been submitted for I/O,
+ * then this will never reach 0. In this case the I/O submitter is
+ * responsible for calling f2fs_decompress_end_io() instead.
+ */
+ atomic_t remaining_pages;
+
+ /*
+ * Number of references to this decompress_io_ctx.
+ *
+ * One reference is held for I/O completion. This reference is dropped
+ * after the pagecache pages are updated and unlocked -- either after
+ * decompression (and verity if enabled), or after an error.
+ *
+ * In addition, each compressed page holds a reference while it is in a
+ * bio. These references are necessary prevent compressed pages from
+ * being freed while they are still in a bio.
+ */
+ refcount_t refcnt;
+
+ bool failed; /* IO error occurred before decompression? */
+ bool need_verity; /* need fs-verity verification after decompression? */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
+ struct work_struct verity_work; /* work to verify the decompressed pages */
};
#define NULL_CLUSTER ((unsigned int)(~0))
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
+ struct ckpt_req_control cprc_info; /* for checkpoint request control */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
unsigned int total_sections; /* total section count */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
- loff_t max_file_blocks; /* max block index of file */
int dir_level; /* directory level */
int readdir_ra; /* readahead inode in readdir */
u64 max_io_bytes; /* max io bytes to merge IOs */
unsigned int node_io_flag;
/* For sysfs suppport */
- struct kobject s_kobj;
+ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */
struct completion s_kobj_unregister;
+ struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */
+ struct completion s_stat_kobj_unregister;
+
/* For shrinker support */
struct list_head s_list;
int s_ndevs; /* number of devices */
int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate(struct inode *inode);
- int f2fs_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags);
- int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr);
int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_precache_extents(struct inode *inode);
void f2fs_inode_synced(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
+loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
void f2fs_destroy_checkpoint_caches(void);
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi);
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
/*
* data.c
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
struct bio **bio, sector_t *last_block,
struct writeback_control *wbc,
enum iostat_type io_type,
- int compr_blocks);
+ int compr_blocks, bool allow_balance);
void f2fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length);
int f2fs_release_page(struct page *page, gfp_t wait);
int nr_discarding, nr_discarded;
int nr_discard_cmd;
unsigned int undiscard_blks;
+ int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
+ unsigned int cur_ckpt_time, peak_ckpt_time;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
int compr_inode;
unsigned long long compr_blocks;
#define stat_dec_compr_inode(inode) do { } while (0)
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
-#define stat_inc_atomic_write(inode) do { } while (0)
-#define stat_dec_atomic_write(inode) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
#define stat_inc_volatile_write(inode) do { } while (0)
#define stat_dec_volatile_write(inode) do { } while (0)
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+void f2fs_end_read_compressed_page(struct page *page, bool failed);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_free_dic(struct decompress_io_ctx *dic);
-void f2fs_decompress_end_io(struct page **rpages,
- unsigned int cluster_size, bool err, bool verity);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
+void f2fs_put_page_dic(struct page *page);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
+static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+{
+ WARN_ON_ONCE(1);
+}
+static inline void f2fs_put_page_dic(struct page *page)
+{
+ WARN_ON_ONCE(1);
+}
static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
static inline int __init f2fs_init_compress_cache(void) { return 0; }
1 << COMPRESS_CHKSUM : 0;
F2FS_I(inode)->i_cluster_size =
1 << F2FS_I(inode)->i_log_cluster_size;
+ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+ F2FS_OPTION(sbi).compress_level)
+ F2FS_I(inode)->i_compress_flag |=
+ F2FS_OPTION(sbi).compress_level <<
+ COMPRESS_LEVEL_OFFSET;
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
return false;
}
+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
unsigned int type);
#include "xattr.h"
#include "acl.h"
#include "gc.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
bool need_alloc = true;
int err = 0;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto err;
goto err;
}
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto err;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
int ret = f2fs_is_compressed_cluster(inode, page->index);
f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
- f2fs_trace_ios(NULL, 1);
return ret;
}
struct inode *inode = file->f_mapping->host;
loff_t maxbytes = inode->i_sb->s_maxbytes;
+ if (f2fs_compressed_file(inode))
+ maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- int err;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- /* we don't need to use inline_data strictly */
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
-
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
set_inode_flag(inode, FI_MMAP_FILE);
free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
- if (free_from >= sbi->max_file_blocks)
+ if (free_from >= max_file_blocks(inode))
goto free_partial;
if (lock)
return -EIO;
}
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
return 0;
}
- int f2fs_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
}
#ifdef CONFIG_F2FS_FS_POSIX_ACL
- static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+ static void __setattr_copy(struct user_namespace *mnt_userns,
+ struct inode *inode, const struct iattr *attr)
{
unsigned int ia_valid = attr->ia_valid;
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
+ kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(inode, CAP_FSETID))
- if (!in_group_p(kgid) && !capable(CAP_FSETID))
++ if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
#define __setattr_copy setattr_copy
#endif
- int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ if (unlikely(IS_APPEND(inode) &&
+ (attr->ia_valid & (ATTR_MODE | ATTR_UID |
+ ATTR_GID | ATTR_TIMES_SET))))
+ return -EPERM;
+
if ((attr->ia_valid & ATTR_SIZE) &&
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err)
return err;
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
- __setattr_copy(inode, attr);
+ __setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
- err = posix_acl_chmod(&init_user_ns, inode,
- f2fs_get_inode_mode(inode));
- if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
- inode->i_mode = F2FS_I(inode)->i_acl_mode;
++ err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ if (!err)
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
clear_inode_flag(inode, FI_ACL_MODE);
}
}
u32 iflags;
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
if (get_user(fsflags, (int __user *)arg))
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
struct inode *inode = file_inode(filp);
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
struct inode *inode = file_inode(filp);
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
struct inode *inode = file_inode(filp);
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
struct inode *inode = file_inode(filp);
int ret;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
return -EINVAL;
if (unlikely((range.start + range.len) >> PAGE_SHIFT >
- sbi->max_file_blocks))
+ max_file_blocks(inode)))
return -EINVAL;
err = mnt_want_write_file(filp);
return -EFAULT;
/* Make sure caller has proper permission */
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- end = F2FS_I_SB(inode)->max_file_blocks;
+ end = max_file_blocks(inode);
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
return fsverity_ioctl_measure(filp, (void __user *)arg);
}
+static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fsverity_ioctl_read_metadata(filp, (const void __user *)arg);
+}
+
static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
for (i = 0; i < page_len; i++, redirty_idx++) {
page = find_lock_page(mapping, redirty_idx);
- if (!page)
- ret = -ENOENT;
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
set_page_dirty(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
return f2fs_ioc_enable_verity(filp, arg);
case FS_IOC_MEASURE_VERITY:
return f2fs_ioc_measure_verity(filp, arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ return f2fs_ioc_read_verity_metadata(filp, arg);
case FS_IOC_GETFSLABEL:
return f2fs_ioc_getfslabel(filp, arg);
case FS_IOC_SETFSLABEL:
inode_lock(inode);
}
+ if (unlikely(IS_IMMUTABLE(inode))) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
ret = generic_write_checks(iocb, from);
if (ret > 0) {
bool preallocated = false;
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
+unlock:
inode_unlock(inode);
out:
trace_f2fs_file_write_iter(inode, iocb->ki_pos,
case F2FS_IOC_RESIZE_FS:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case FS_IOC_GETFSLABEL:
case FS_IOC_SETFSLABEL:
case F2FS_IOC_GET_COMPRESS_BLOCKS:
nid_free = true;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
}
}
- static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
return link;
}
- static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
+ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
return err;
}
- static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
return -ENOTEMPTY;
}
- static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
if (whiteout) {
f2fs_i_links_write(inode, false);
+
+ spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
+ spin_unlock(&inode->i_lock);
+
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
return err;
}
- static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
+
+ spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
iput(whiteout);
}
return err;
}
- static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ static int f2fs_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
}
static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
}
static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
unsigned char old_advise = F2FS_I(inode)->i_advise;
unsigned char new_advise;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EPERM;
if (value == NULL)
return -EINVAL;
void *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int inline_size = inline_xattr_size(inode);
- int err = 0;
+ int err;
if (!xnid && !inline_size)
return -ENODATA;
void *buffer, size_t buffer_size, struct page *ipage)
{
struct f2fs_xattr_entry *entry = NULL;
- int error = 0;
+ int error;
unsigned int size, len;
void *base_addr = NULL;
int base_size;
struct inode *inode = d_inode(dentry);
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
- int error = 0;
+ int error;
size_t rest = buffer_size;
down_read(&F2FS_I(inode)->i_xattr_sem);
int found, newsize;
size_t len;
__u32 new_hsize;
- int error = 0;
+ int error;
if (name == NULL)
return -EINVAL;
}
if (value && f2fs_xattr_value_same(here, value, size))
- goto exit;
+ goto same;
} else if ((flags & XATTR_REPLACE)) {
error = -ENODATA;
goto exit;
if (error)
goto exit;
- if (is_inode_flag_set(inode, FI_ACL_MODE)) {
- inode->i_mode = F2FS_I(inode)->i_acl_mode;
- inode->i_ctime = current_time(inode);
- clear_inode_flag(inode, FI_ACL_MODE);
- }
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
+
+same:
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ inode->i_ctime = current_time(inode);
+ clear_inode_flag(inode, FI_ACL_MODE);
+ }
+
exit:
kfree(base_addr);
return error;
goto out_unlock_inode;
/* This MUST be done before doing anything irreversible... */
- err = fat_setattr(file->f_path.dentry, &ia);
+ err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia);
if (err)
goto out_unlock_inode;
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
fat_flush_inodes(inode->i_sb, inode, NULL);
}
- int fat_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
return 0;
}
- static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
+ static int fat_allow_set_time(struct user_namespace *mnt_userns,
+ struct msdos_sb_info *sbi, struct inode *inode)
{
umode_t allow_utime = sbi->options.allow_utime;
- if (!uid_eq(current_fsuid(), inode->i_uid)) {
- if (in_group_p(inode->i_gid))
+ if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
+ if (in_group_p(i_gid_into_mnt(mnt_userns, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return 1;
/* valid file mode bits */
#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
- int fat_setattr(struct dentry *dentry, struct iattr *attr)
+ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
struct inode *inode = d_inode(dentry);
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
if (ia_valid & TIMES_SET_FLAGS) {
- if (fat_allow_set_time(sbi, inode))
+ if (fat_allow_set_time(mnt_userns, sbi, inode))
attr->ia_valid &= ~TIMES_SET_FLAGS;
}
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(mnt_userns, dentry, attr);
attr->ia_valid = ia_valid;
if (error) {
if (sbi->options.quiet)
fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
- setattr_copy(inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
out:
return error;
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
+ #include <linux/mount.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
/* O_NOATIME can only be set by the owner or superuser */
if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode))
return -EPERM;
/* required for strict SunOS emulation */
pid_t f_getown(struct file *filp)
{
- pid_t pid;
+ pid_t pid = 0;
read_lock(&filp->f_owner.lock);
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
- pid = -pid;
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
+ pid = pid_vnr(filp->f_owner.pid);
+ if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ pid = -pid;
+ }
+ rcu_read_unlock();
read_unlock(&filp->f_owner.lock);
return pid;
}
static int f_getown_ex(struct file *filp, unsigned long arg)
{
struct f_owner_ex __user *owner_p = (void __user *)arg;
- struct f_owner_ex owner;
+ struct f_owner_ex owner = {};
int ret = 0;
read_lock(&filp->f_owner.lock);
- owner.pid = pid_vnr(filp->f_owner.pid);
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
+ owner.pid = pid_vnr(filp->f_owner.pid);
+ rcu_read_unlock();
switch (filp->f_owner.pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
goto out;
error = -EACCES;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
goto out;
error = 0;
!capable(CAP_LINUX_IMMUTABLE))
goto out;
if (!IS_IMMUTABLE(inode)) {
- error = gfs2_permission(inode, MAY_WRITE);
+ error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
if (error)
goto out;
}
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY_ALL;
+ int sync_state = inode->i_state & I_DIRTY;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
if (!gfs2_is_jdata(ip))
sync_state &= ~I_DIRTY_PAGES;
if (datasync)
- sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
+ sync_state &= ~I_DIRTY_SYNC;
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
if (ret)
goto out_uninit;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
-
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
if (offset + len > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
out:
return 0;
}
- static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hfsplus_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
inode->i_mtime = inode->i_ctime = current_time(inode);
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
}
- int hfsplus_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP;
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
return 0;
}
}
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(inode->i_sb->s_bdev);
inode_unlock(inode);
return NULL;
inode->i_ino = sbi->next_cnid++;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
+static struct kmem_cache *hostfs_inode_cache;
+
/* Changed in hostfs_args before the kernel starts running */
static char *root_ino = "";
static int append = 0;
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
+ hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
static void hostfs_free_inode(struct inode *inode)
{
- kfree(HOSTFS_I(inode));
+ kmem_cache_free(hostfs_inode_cache, HOSTFS_I(inode));
}
static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
- static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+ static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
struct inode *inode;
char *name;
return err;
}
- static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
- const char *to)
+ static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino,
+ struct dentry *dentry, const char *to)
{
char *file;
int err;
return err;
}
- static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+ static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino,
+ struct dentry *dentry, umode_t mode)
{
char *file;
int err;
return err;
}
- static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
char *name;
return err;
}
- static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ static int hostfs_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
return err;
}
- static int hostfs_permission(struct inode *ino, int desired)
+ static int hostfs_permission(struct user_namespace *mnt_userns,
+ struct inode *ino, int desired)
{
char *name;
int r = 0, w = 0, x = 0, err;
err = access_file(name, r, w, x);
__putname(name);
if (!err)
- err = generic_permission(ino, desired);
+ err = generic_permission(&init_user_ns, ino, desired);
return err;
}
- static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hostfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct hostfs_iattr attrs;
int fd = HOSTFS_I(inode)->fd;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err)
return err;
attr->ia_size != i_size_read(inode))
truncate_setsize(inode, attr->ia_size);
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
}
static int __init init_hostfs(void)
{
+ hostfs_inode_cache = KMEM_CACHE(hostfs_inode_info, 0);
+ if (!hostfs_inode_cache)
+ return -ENOMEM;
return register_filesystem(&hostfs_type);
}
static void __exit exit_hostfs(void)
{
unregister_filesystem(&hostfs_type);
+ kmem_cache_destroy(hostfs_inode_cache);
}
module_init(init_hostfs)
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ set_page_huge_active(page);
/*
* unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
put_page(page);
return error;
}
- static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct hstate *h = hstate_inode(inode);
BUG_ON(!inode);
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
return error;
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
}
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
inode->i_ino = get_next_ino();
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
return error;
}
- static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
+ static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
}
- static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
+ mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
return retval;
}
- static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+ static int hugetlbfs_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
{
- return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+ return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
}
- static int hugetlbfs_tmpfile(struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
}
- static int hugetlbfs_symlink(struct inode *dir,
- struct dentry *dentry, const char *symname)
+ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ const char *symname)
{
struct inode *inode;
int error = -ENOSPC;
EXPORT_SYMBOL(find_inode_rcu);
/**
- * find_inode_by_rcu - Find an inode in the inode cache
+ * find_inode_by_ino_rcu - Find an inode in the inode cache
* @sb: Super block of file system to search
* @ino: The inode number to match
*
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
- int iflags = I_DIRTY_TIME;
- bool dirty = false;
-
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_VERSION)
- dirty = inode_maybe_inc_iversion(inode, false);
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
- if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
- !(inode->i_sb->s_flags & SB_LAZYTIME))
- dirty = true;
-
- if (dirty)
- iflags |= I_DIRTY_SYNC;
- __mark_inode_dirty(inode, iflags);
+ int dirty_flags = 0;
+
+ if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (inode->i_sb->s_flags & SB_LAZYTIME)
+ dirty_flags |= I_DIRTY_TIME;
+ else
+ dirty_flags |= I_DIRTY_SYNC;
+ }
+
+ if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
+ dirty_flags |= I_DIRTY_SYNC;
+
+ __mark_inode_dirty(inode, dirty_flags);
return 0;
}
EXPORT_SYMBOL(generic_update_time);
}
/**
- * touch_atime - update the access time
+ * atime_needs_update - update the access time
* @path: the &struct path to update
* @inode: inode to update
*
/* Atime updates will likely cause i_uid and i_gid to be written
* back improprely if their true value is unknown to the vfs.
*/
- if (HAS_UNMAPPED_ID(inode))
+ if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
return false;
if (IS_NOATIME(inode))
return mask;
}
- static int __remove_privs(struct dentry *dentry, int kill)
+ static int __remove_privs(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int kill)
{
struct iattr newattrs;
* Note we call this on write, so notify_change will not
* encounter any conflicting delegations:
*/
- return notify_change(dentry, &newattrs, NULL);
+ return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
/*
if (kill < 0)
return kill;
if (kill)
- error = __remove_privs(dentry, kill);
+ error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
/**
* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
+ * @mnt_userns: User namespace of the mount the inode was created from
* @inode: New inode
* @dir: Directory inode
* @mode: mode of the new inode
+ *
+ * If the inode has been created through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions
+ * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
+ * checking is to be performed on the raw inode simply passs init_user_ns.
*/
- void inode_init_owner(struct inode *inode, const struct inode *dir,
- umode_t mode)
+ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct inode *dir, umode_t mode)
{
- inode->i_uid = current_fsuid();
+ inode->i_uid = fsuid_into_mnt(mnt_userns);
if (dir && dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
if (S_ISDIR(mode))
mode |= S_ISGID;
else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(dir, CAP_FSETID))
+ !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
mode &= ~S_ISGID;
} else
- inode->i_gid = current_fsgid();
+ inode->i_gid = fsgid_into_mnt(mnt_userns);
inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);
/**
* inode_owner_or_capable - check current task permissions to inode
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode being checked
*
* Return true if current either has CAP_FOWNER in a namespace with the
* inode owner uid mapped, or owns the file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
*/
- bool inode_owner_or_capable(const struct inode *inode)
+ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode)
{
+ kuid_t i_uid;
struct user_namespace *ns;
- if (uid_eq(current_fsuid(), inode->i_uid))
+ i_uid = i_uid_into_mnt(mnt_userns, inode);
+ if (uid_eq(current_fsuid(), i_uid))
return true;
ns = current_user_ns();
- if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
+ if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
struct shrink_control;
struct fs_context;
struct user_namespace;
+struct pipe_inode_info;
/*
* block_dev.c
const char *, unsigned int, struct path *);
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
- int may_linkat(struct path *link);
+ int may_linkat(struct user_namespace *mnt_userns, struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
+extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
*/
int do_statx(int dfd, const char __user *filename, unsigned flags,
unsigned int mask, struct statx __user *buffer);
+
+/*
+ * fs/splice.c:
+ */
+long splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags);
#include "internal.h"
- int simple_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+ int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
}
EXPORT_SYMBOL(simple_rmdir);
- int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
* on simple regular filesystems. Anything that needs to change on-disk
* or wire state on size changes needs its own setattr method.
*/
- int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
int error;
- error = setattr_prepare(dentry, iattr);
+ error = setattr_prepare(mnt_userns, dentry, iattr);
if (error)
return error;
if (iattr->ia_valid & ATTR_SIZE)
truncate_setsize(inode, iattr->ia_size);
- setattr_copy(inode, iattr);
+ setattr_copy(mnt_userns, inode, iattr);
mark_inode_dirty(inode);
return 0;
}
err = __generic_file_fsync(file, start, end, datasync);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);
return 0;
};
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
return ERR_PTR(-ENOENT);
}
- static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+ static int empty_dir_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
return 0;
}
- static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+ static int empty_dir_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
return -EPERM;
}
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
-int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
+static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
{
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *dir = READ_ONCE(parent->d_inode);
return 1;
return !!memcmp(str, name->name, len);
}
-EXPORT_SYMBOL(generic_ci_d_compare);
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
-int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
return -EINVAL;
return 0;
}
-EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
__putname(name);
}
- static int check_acl(struct inode *inode, int mask)
+ /**
+ * check_acl - perform ACL permission checking
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to check permissions on
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the ACL permission checking. Since this function
+ * retrieve POSIX acls it needs to know whether it is called from a blocking or
+ * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ static int check_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *acl;
/* no ->get_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
- return posix_acl_permission(inode, acl, mask);
+ return posix_acl_permission(mnt_userns, inode, acl, mask);
}
acl = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
+ int error = posix_acl_permission(mnt_userns, inode, acl, mask);
posix_acl_release(acl);
return error;
}
return -EAGAIN;
}
- /*
- * This does the basic UNIX permission checking.
+ /**
+ * acl_permission_check - perform basic UNIX permission checking
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to check permissions on
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the basic UNIX permission checking. Since this
+ * function may retrieve POSIX acls it needs to know whether it is called from a
+ * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
*
- * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
- * for RCU walking.
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
*/
- static int acl_permission_check(struct inode *inode, int mask)
+ static int acl_permission_check(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
+ kuid_t i_uid;
/* Are we the owner? If so, ACL's don't matter */
- if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
+ i_uid = i_uid_into_mnt(mnt_userns, inode);
+ if (likely(uid_eq(current_fsuid(), i_uid))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
/* Do we have ACL's? */
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
- int error = check_acl(inode, mask);
+ int error = check_acl(mnt_userns, inode, mask);
if (error != -EAGAIN)
return error;
}
* about? Need to check group ownership if so.
*/
if (mask & (mode ^ (mode >> 3))) {
- if (in_group_p(inode->i_gid))
+ kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+ if (in_group_p(kgid))
mode >>= 3;
}
/**
* generic_permission - check for access rights on a Posix-like filesystem
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to check access rights for
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
* %MAY_NOT_BLOCK ...)
* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
* request cannot be satisfied (eg. requires blocking or too much complexity).
* It would then be called again in ref-walk mode.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
*/
- int generic_permission(struct inode *inode, int mask)
+ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
int ret;
/*
* Do the basic permission checks.
*/
- ret = acl_permission_check(inode, mask);
+ ret = acl_permission_check(mnt_userns, inode, mask);
if (ret != -EACCES)
return ret;
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
if (!(mask & MAY_WRITE))
- if (capable_wrt_inode_uidgid(inode,
+ if (capable_wrt_inode_uidgid(mnt_userns, inode,
CAP_DAC_READ_SEARCH))
return 0;
- if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(mnt_userns, inode,
+ CAP_DAC_OVERRIDE))
return 0;
return -EACCES;
}
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
- if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+ if (capable_wrt_inode_uidgid(mnt_userns, inode,
+ CAP_DAC_READ_SEARCH))
return 0;
/*
* Read/write DACs are always overridable.
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
- if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(mnt_userns, inode,
+ CAP_DAC_OVERRIDE))
return 0;
return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
- /*
+ /**
+ * do_inode_permission - UNIX permission checking
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to check permissions on
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
* We _really_ want to just do "generic_permission()" without
* even looking at the inode->i_op values. So we keep a cache
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
- static inline int do_inode_permission(struct inode *inode, int mask)
+ static inline int do_inode_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
if (likely(inode->i_op->permission))
- return inode->i_op->permission(inode, mask);
+ return inode->i_op->permission(mnt_userns, inode, mask);
/* This gets set once for the inode lifetime */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_FASTPERM;
spin_unlock(&inode->i_lock);
}
- return generic_permission(inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
/**
/**
* inode_permission - Check for access rights to a given inode
- * @inode: Inode to check permission on
- * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mnt_userns: User namespace of the mount the inode was found from
+ * @inode: Inode to check permission on
+ * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Check for read/write/execute permissions on an inode. We use fs[ug]id for
* this, letting us set arbitrary permissions for filesystem access without
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
- int inode_permission(struct inode *inode, int mask)
+ int inode_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
int retval;
* written back improperly if their true value is unknown
* to the vfs.
*/
- if (HAS_UNMAPPED_ID(inode))
+ if (HAS_UNMAPPED_ID(mnt_userns, inode))
return -EACCES;
}
- retval = do_inode_permission(inode, mask);
+ retval = do_inode_permission(mnt_userns, inode, mask);
if (retval)
return retval;
static bool legitimize_links(struct nameidata *nd)
{
int i;
+ if (unlikely(nd->flags & LOOKUP_CACHED)) {
+ drop_links(nd);
+ nd->depth = 0;
+ return false;
+ }
for (i = 0; i < nd->depth; i++) {
struct saved *last = nd->stack + i;
if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
*/
/**
- * unlazy_walk - try to switch to ref-walk mode.
+ * try_to_unlazy - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
- * Returns: 0 on success, -ECHILD on failure
+ * Returns: true on success, false on failure
*
- * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ * try_to_unlazy attempts to legitimize the current nd->path and nd->root
* for ref-walk mode.
* Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_walk() failure and
+ * Nothing should touch nameidata between try_to_unlazy() failure and
* terminate_walk().
*/
-static int unlazy_walk(struct nameidata *nd)
+static bool try_to_unlazy(struct nameidata *nd)
{
struct dentry *parent = nd->path.dentry;
goto out;
rcu_read_unlock();
BUG_ON(nd->inode != parent->d_inode);
- return 0;
+ return true;
out1:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
out:
rcu_read_unlock();
- return -ECHILD;
+ return false;
}
/**
- * unlazy_child - try to switch to ref-walk mode.
+ * try_to_unlazy_next - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
- * @dentry: child of nd->path.dentry
- * @seq: seq number to check dentry against
- * Returns: 0 on success, -ECHILD on failure
+ * @dentry: next dentry to step into
+ * @seq: seq number to check @dentry against
+ * Returns: true on success, false on failure
*
- * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
- * for ref-walk mode. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_child() failure and
+ * Similar to to try_to_unlazy(), but here we have the next dentry already
+ * picked by rcu-walk and want to legitimize that in addition to the current
+ * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
+ * Nothing should touch nameidata between try_to_unlazy_next() failure and
* terminate_walk().
*/
-static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
BUG_ON(!(nd->flags & LOOKUP_RCU));
if (unlikely(!legitimize_root(nd)))
goto out_dput;
rcu_read_unlock();
- return 0;
+ return true;
out2:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
out:
rcu_read_unlock();
- return -ECHILD;
+ return false;
out_dput:
rcu_read_unlock();
dput(dentry);
- return -ECHILD;
+ return false;
}
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
*/
if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
- if (unlikely(unlazy_walk(nd)))
+ nd->flags &= ~LOOKUP_CACHED;
+ if (!try_to_unlazy(nd))
return -ECHILD;
}
*/
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
+ struct user_namespace *mnt_userns;
+ kuid_t i_uid;
+
if (!sysctl_protected_symlinks)
return 0;
+ mnt_userns = mnt_user_ns(nd->path.mnt);
+ i_uid = i_uid_into_mnt(mnt_userns, inode);
/* Allowed if owner and follower match. */
- if (uid_eq(current_cred()->fsuid, inode->i_uid))
+ if (uid_eq(current_cred()->fsuid, i_uid))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
+ if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
/**
* safe_hardlink_source - Check for safe hardlink conditions
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: the source inode to hardlink from
*
* Return false if at least one of the following conditions:
*
* Otherwise returns true.
*/
- static bool safe_hardlink_source(struct inode *inode)
+ static bool safe_hardlink_source(struct user_namespace *mnt_userns,
+ struct inode *inode)
{
umode_t mode = inode->i_mode;
return false;
/* Hardlinking to unreadable or unwritable sources is dangerous. */
- if (inode_permission(inode, MAY_READ | MAY_WRITE))
+ if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
return false;
return true;
/**
* may_linkat - Check permissions for creating a hardlink
+ * @mnt_userns: user namespace of the mount the inode was found from
* @link: the source to hardlink from
*
* Block hardlink when all of:
* - hardlink source is unsafe (see safe_hardlink_source() above)
* - not CAP_FOWNER in a namespace with the inode owner uid mapped
*
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
* Returns 0 if successful, -ve on error.
*/
- int may_linkat(struct path *link)
+ int may_linkat(struct user_namespace *mnt_userns, struct path *link)
{
struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+ !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
- if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+ if (safe_hardlink_source(mnt_userns, inode) ||
+ inode_owner_or_capable(mnt_userns, inode))
return 0;
audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
* should be allowed, or not, on files that already
* exist.
+ * @mnt_userns: user namespace of the mount the inode was found from
* @dir_mode: mode bits of directory
* @dir_uid: owner of directory
* @inode: the inode of the file to open
* the directory doesn't have to be world writable: being group writable will
* be enough.
*
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
* Returns 0 if the open is allowed, -ve on error.
*/
- static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
- struct inode * const inode)
+ static int may_create_in_sticky(struct user_namespace *mnt_userns,
+ struct nameidata *nd, struct inode *const inode)
{
+ umode_t dir_mode = nd->dir_mode;
+ kuid_t dir_uid = nd->dir_uid;
+
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
likely(!(dir_mode & S_ISVTX)) ||
- uid_eq(inode->i_uid, dir_uid) ||
- uid_eq(current_fsuid(), inode->i_uid))
+ uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
+ uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
return 0;
if (likely(dir_mode & 0002) ||
return -ENOENT;
if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
return 0;
- if (unlazy_child(nd, dentry, seq))
+ if (!try_to_unlazy_next(nd, dentry, seq))
return -ECHILD;
// *path might've been clobbered by __follow_mount_rcu()
path->mnt = nd->path.mnt;
unsigned seq;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (unlikely(!dentry)) {
- if (unlazy_walk(nd))
+ if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
return NULL;
}
status = d_revalidate(dentry, nd->flags);
if (likely(status > 0))
return dentry;
- if (unlazy_child(nd, dentry, seq))
+ if (!try_to_unlazy_next(nd, dentry, seq))
return ERR_PTR(-ECHILD);
- if (unlikely(status == -ECHILD))
+ if (status == -ECHILD)
/* we'd been told to redo it in non-rcu mode */
status = d_revalidate(dentry, nd->flags);
} else {
return res;
}
- static inline int may_lookup(struct nameidata *nd)
+ static inline int may_lookup(struct user_namespace *mnt_userns,
+ struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- int err = inode_permission(mnt_userns, nd->inode,
- MAY_EXEC | MAY_NOT_BLOCK);
- if (err != -ECHILD)
++ int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ if (err != -ECHILD || !try_to_unlazy(nd))
return err;
- if (unlazy_walk(nd))
- return -ECHILD;
}
- return inode_permission(nd->inode, MAY_EXEC);
+ return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
// unlazy even if we fail to grab the link - cleanup needs it
bool grabbed_link = legitimize_path(nd, link, seq);
- if (unlazy_walk(nd) != 0 || !grabbed_link)
+ if (!try_to_unlazy(nd) != 0 || !grabbed_link)
return -ECHILD;
if (nd_alloc_stack(nd))
touch_atime(&last->link);
cond_resched();
} else if (atime_needs_update(&last->link, inode)) {
- if (unlikely(unlazy_walk(nd)))
+ if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
touch_atime(&last->link);
}
get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
res = get(NULL, inode, &last->done);
- if (res == ERR_PTR(-ECHILD)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
+ if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
res = get(link->dentry, inode, &last->done);
- }
} else {
res = get(link->dentry, inode, &last->done);
}
/* At this point we know we have a real path component. */
for(;;) {
+ struct user_namespace *mnt_userns;
const char *link;
u64 hash_len;
int type;
- err = may_lookup(nd);
+ mnt_userns = mnt_user_ns(nd->path.mnt);
+ err = may_lookup(mnt_userns, nd);
if (err)
return err;
OK:
/* pathname or trailing symlink, done */
if (!depth) {
- nd->dir_uid = nd->inode->i_uid;
+ nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
}
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
- if (unlazy_walk(nd))
+ if (!try_to_unlazy(nd))
return -ECHILD;
}
return -ENOTDIR;
int error;
const char *s = nd->name->name;
+ /* LOOKUP_CACHED requires RCU, ask caller to retry */
+ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+ return ERR_PTR(-EAGAIN);
+
if (!*s)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
return err;
}
- return inode_permission(base->d_inode, MAY_EXEC);
+ return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
}
/**
}
EXPORT_SYMBOL(user_path_at_empty);
- int __check_sticky(struct inode *dir, struct inode *inode)
+ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode *inode)
{
kuid_t fsuid = current_fsuid();
- if (uid_eq(inode->i_uid, fsuid))
+ if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
return 0;
- if (uid_eq(dir->i_uid, fsuid))
+ if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
return 0;
- return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+ return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);
* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
- static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *victim, bool isdir)
{
struct inode *inode = d_backing_inode(victim);
int error;
BUG_ON(victim->d_parent->d_inode != dir);
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+ !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, inode) || IS_APPEND(inode) ||
- IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
+ if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
+ HAS_UNMAPPED_ID(mnt_userns, inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
* 4. We should have write and exec permissions on dir
* 5. We can't do it if dir is immutable (done in permission())
*/
- static inline int may_create(struct inode *dir, struct dentry *child)
+ static inline int may_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *child)
{
struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (IS_DEADDIR(dir))
return -ENOENT;
s_user_ns = dir->i_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
- !kgid_has_mapping(s_user_ns, current_fsgid()))
+ if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+ !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
return -EOVERFLOW;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
/*
}
EXPORT_SYMBOL(unlock_rename);
- int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool want_excl)
+ /**
+ * vfs_create - create new file
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: inode of @dentry
+ * @dentry: pointer to dentry of the base directory
+ * @mode: mode of the new file
+ * @want_excl: whether the file must not yet exist
+ *
+ * Create a new file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool want_excl)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- error = dir->i_op->create(dir, dentry, mode, want_excl);
+ error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
if (!error)
fsnotify_create(dir, dentry);
return error;
void *arg)
{
struct inode *dir = dentry->d_parent->d_inode;
- int error = may_create(dir, dentry);
+ int error = may_create(&init_user_ns, dir, dentry);
if (error)
return error;
!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
- static int may_open(const struct path *path, int acc_mode, int flag)
+ static int may_open(struct user_namespace *mnt_userns, const struct path *path,
+ int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
struct inode *inode = dentry->d_inode;
break;
}
- error = inode_permission(inode, MAY_OPEN | acc_mode);
+ error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
if (error)
return error;
}
/* O_NOATIME can only be set by the owner or superuser */
- if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+ if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
return 0;
}
- static int handle_truncate(struct file *filp)
+ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
{
const struct path *path = &filp->f_path;
struct inode *inode = path->dentry->d_inode;
if (!error)
error = security_path_truncate(path);
if (!error) {
- error = do_truncate(path->dentry, 0,
+ error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
return flag;
}
- static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
+ static int may_o_create(struct user_namespace *mnt_userns,
+ const struct path *dir, struct dentry *dentry,
+ umode_t mode)
{
struct user_namespace *s_user_ns;
int error = security_path_mknod(dir, dentry, mode, 0);
return error;
s_user_ns = dir->dentry->d_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
- !kgid_has_mapping(s_user_ns, current_fsgid()))
+ if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+ !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
return -EOVERFLOW;
- error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir->dentry->d_inode,
+ MAY_WRITE | MAY_EXEC);
if (error)
return error;
const struct open_flags *op,
bool got_write)
{
+ struct user_namespace *mnt_userns;
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
int open_flag = op->open_flag;
*/
if (unlikely(!got_write))
open_flag &= ~O_TRUNC;
+ mnt_userns = mnt_user_ns(nd->path.mnt);
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current_umask();
if (likely(got_write))
- create_error = may_o_create(&nd->path, dentry, mode);
+ create_error = may_o_create(mnt_userns, &nd->path,
+ dentry, mode);
else
create_error = -EROFS;
}
error = -EACCES;
goto out_dput;
}
- error = dir_inode->i_op->create(dir_inode, dentry, mode,
- open_flag & O_EXCL);
+
+ error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
+ mode, open_flag & O_EXCL);
if (error)
goto out_dput;
}
struct inode *inode;
struct dentry *dentry;
const char *res;
- int error;
nd->flags |= op->intent;
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
- error = unlazy_walk(nd);
- if (unlikely(error))
- return ERR_PTR(error);
+ if (!try_to_unlazy(nd))
+ return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
- error = mnt_want_write(nd->path.mnt);
- if (!error)
- got_write = true;
+ got_write = !mnt_want_write(nd->path.mnt);
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
+ struct user_namespace *mnt_userns;
int open_flag = op->open_flag;
bool do_truncate;
int acc_mode;
}
if (!(file->f_mode & FMODE_CREATED))
audit_inode(nd->name, nd->path.dentry, 0);
+ mnt_userns = mnt_user_ns(nd->path.mnt);
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
if (d_is_dir(nd->path.dentry))
return -EISDIR;
- error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
+ error = may_create_in_sticky(mnt_userns, nd,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
return error;
return error;
do_truncate = true;
}
- error = may_open(&nd->path, acc_mode, open_flag);
+ error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
if (!error)
error = ima_file_check(file, op->acc_mode);
if (!error && do_truncate)
- error = handle_truncate(file);
+ error = handle_truncate(mnt_userns, file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
return error;
}
- struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+ /**
+ * vfs_tmpfile - create tmpfile
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: pointer to dentry of the base directory
+ * @mode: mode of the new tmpfile
+ * @open_flags: flags
+ *
+ * Create a temporary file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+ struct dentry *dentry, umode_t mode, int open_flag)
{
struct dentry *child = NULL;
struct inode *dir = dentry->d_inode;
int error;
/* we want directory to be writable */
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_err;
error = -EOPNOTSUPP;
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
- error = dir->i_op->tmpfile(dir, child, mode);
+ error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
if (error)
goto out_err;
error = -ENOENT;
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
- ima_post_create_tmpfile(inode);
+ ima_post_create_tmpfile(mnt_userns, inode);
return child;
out_err:
const struct open_flags *op,
struct file *file)
{
+ struct user_namespace *mnt_userns;
struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
- child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+ mnt_userns = mnt_user_ns(path.mnt);
+ child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
error = PTR_ERR(child);
if (IS_ERR(child))
goto out2;
path.dentry = child;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
- error = may_open(&path, 0, op->open_flag);
+ error = may_open(mnt_userns, &path, 0, op->open_flag);
- if (error)
- goto out2;
- file->f_path.mnt = path.mnt;
- error = finish_open(file, child, NULL);
+ if (!error)
+ error = vfs_open(&path, file);
out2:
mnt_drop_write(path.mnt);
out:
}
EXPORT_SYMBOL(user_path_create);
- int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ /**
+ * vfs_mknod - create device node or file
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: inode of @dentry
+ * @dentry: pointer to dentry of the base directory
+ * @mode: mode of the new device node or file
+ * @dev: device number of device to create
+ *
+ * Create a device node or file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
- int error = may_create(dir, dentry);
+ int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
if (error)
return error;
- error = dir->i_op->mknod(dir, dentry, mode, dev);
+ error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
return error;
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
unsigned int dev)
{
+ struct user_namespace *mnt_userns;
struct dentry *dentry;
struct path path;
int error;
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out;
+
+ mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ error = vfs_create(mnt_userns, path.dentry->d_inode,
+ dentry, mode, true);
if (!error)
- ima_post_path_mknod(dentry);
+ ima_post_path_mknod(mnt_userns, dentry);
break;
case S_IFCHR: case S_IFBLK:
- error = vfs_mknod(path.dentry->d_inode,dentry,mode,
- new_decode_dev(dev));
+ error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+ dentry, mode, new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
- error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+ error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+ dentry, mode, 0);
break;
}
out:
return do_mknodat(AT_FDCWD, filename, mode, dev);
}
- int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ /**
+ * vfs_mkdir - create directory
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: inode of @dentry
+ * @dentry: pointer to dentry of the base directory
+ * @mode: mode of the new directory
+ *
+ * Create a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt_userns, dir, dentry);
unsigned max_links = dir->i_sb->s_max_links;
if (error)
if (max_links && dir->i_nlink >= max_links)
return -EMLINK;
- error = dir->i_op->mkdir(dir, dentry, mode);
+ error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
return error;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = security_path_mkdir(&path, dentry, mode);
- if (!error)
- error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ if (!error) {
+ struct user_namespace *mnt_userns;
+ mnt_userns = mnt_user_ns(path.mnt);
+ error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
+ mode);
+ }
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
return do_mkdirat(AT_FDCWD, pathname, mode);
}
- int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+ /**
+ * vfs_rmdir - remove directory
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: inode of @dentry
+ * @dentry: pointer to dentry of the base directory
+ *
+ * Remove a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry)
{
- int error = may_delete(dir, dentry, 1);
+ int error = may_delete(mnt_userns, dir, dentry, 1);
if (error)
return error;
long do_rmdir(int dfd, struct filename *name)
{
+ struct user_namespace *mnt_userns;
int error = 0;
struct dentry *dentry;
struct path path;
error = security_path_rmdir(&path, dentry);
if (error)
goto exit3;
- error = vfs_rmdir(path.dentry->d_inode, dentry);
+ mnt_userns = mnt_user_ns(path.mnt);
+ error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
exit3:
dput(dentry);
exit2:
/**
* vfs_unlink - unlink a filesystem object
+ * @mnt_userns: user namespace of the mount the inode was found from
* @dir: parent directory
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
*/
- int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
- int error = may_delete(dir, dentry, 0);
+ int error = may_delete(mnt_userns, dir, dentry, 0);
if (error)
return error;
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
+ struct user_namespace *mnt_userns;
+
/* Why not before? Because we want correct error value */
if (last.name[last.len])
goto slashes;
error = security_path_unlink(&path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+ mnt_userns = mnt_user_ns(path.mnt);
+ error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
+ &delegated_inode);
exit2:
dput(dentry);
}
return do_unlinkat(AT_FDCWD, getname(pathname));
}
- int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+ /**
+ * vfs_symlink - create symlink
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: inode of @dentry
+ * @dentry: pointer to dentry of the base directory
+ * @oldname: name of the file to link to
+ *
+ * Create a symlink.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *oldname)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
if (error)
return error;
- error = dir->i_op->symlink(dir, dentry, oldname);
+ error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
if (!error)
fsnotify_create(dir, dentry);
return error;
goto out_putname;
error = security_path_symlink(&path, dentry, from->name);
- if (!error)
- error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+ if (!error) {
+ struct user_namespace *mnt_userns;
+
+ mnt_userns = mnt_user_ns(path.mnt);
+ error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
+ from->name);
+ }
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
/**
* vfs_link - create a new link
* @old_dentry: object to be linked
+ * @mnt_userns: the user namespace of the mount
* @dir: new parent
* @new_dentry: where to create the new link
* @delegated_inode: returns inode needing a delegation break
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
*/
- int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *new_dentry,
+ struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry);
+ error = may_create(mnt_userns, dir, new_dentry);
if (error)
return error;
* be writen back improperly if their true value is unknown to
* the vfs.
*/
- if (HAS_UNMAPPED_ID(inode))
+ if (HAS_UNMAPPED_ID(mnt_userns, inode))
return -EPERM;
if (!dir->i_op->link)
return -EPERM;
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
const char __user *newname, int flags)
{
+ struct user_namespace *mnt_userns;
struct dentry *new_dentry;
struct path old_path, new_path;
struct inode *delegated_inode = NULL;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
goto out_dput;
- error = may_linkat(&old_path);
+ mnt_userns = mnt_user_ns(new_path.mnt);
+ error = may_linkat(mnt_userns, &old_path);
if (unlikely(error))
goto out_dput;
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+ error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+ new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
if (delegated_inode) {
/**
* vfs_rename - rename a filesystem object
- * @old_dir: parent of source
- * @old_dentry: source
- * @new_dir: parent of destination
- * @new_dentry: destination
- * @delegated_inode: returns an inode needing a delegation break
- * @flags: rename flags
+ * @old_mnt_userns: old user namespace of the mount the inode was found from
+ * @old_dir: parent of source
+ * @old_dentry: source
+ * @new_mnt_userns: new user namespace of the mount the inode was found from
+ * @new_dir: parent of destination
+ * @new_dentry: destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags: rename flags
*
* The caller must hold multiple mutexes--see lock_rename()).
*
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- struct inode **delegated_inode, unsigned int flags)
+ int vfs_rename(struct renamedata *rd)
{
int error;
+ struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+ struct dentry *old_dentry = rd->old_dentry;
+ struct dentry *new_dentry = rd->new_dentry;
+ struct inode **delegated_inode = rd->delegated_inode;
+ unsigned int flags = rd->flags;
bool is_dir = d_is_dir(old_dentry);
struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
if (source == target)
return 0;
- error = may_delete(old_dir, old_dentry, is_dir);
+ error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(new_dir, new_dentry);
+ error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(new_dir, new_dentry, is_dir);
+ error = may_delete(rd->new_mnt_userns, new_dir,
+ new_dentry, is_dir);
else
- error = may_delete(new_dir, new_dentry, new_is_dir);
+ error = may_delete(rd->new_mnt_userns, new_dir,
+ new_dentry, new_is_dir);
}
if (error)
return error;
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(source, MAY_WRITE);
+ error = inode_permission(rd->old_mnt_userns, source,
+ MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(target, MAY_WRITE);
+ error = inode_permission(rd->new_mnt_userns, target,
+ MAY_WRITE);
if (error)
return error;
}
if (error)
goto out;
}
- error = old_dir->i_op->rename(old_dir, old_dentry,
- new_dir, new_dentry, flags);
+ error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
+ new_dir, new_dentry, flags);
if (error)
goto out;
int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct filename *to, unsigned int flags)
{
+ struct renamedata rd;
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
struct path old_path, new_path;
&new_path, new_dentry, flags);
if (error)
goto exit5;
- error = vfs_rename(old_path.dentry->d_inode, old_dentry,
- new_path.dentry->d_inode, new_dentry,
- &delegated_inode, flags);
+
+ rd.old_dir = old_path.dentry->d_inode;
+ rd.old_dentry = old_dentry;
+ rd.old_mnt_userns = mnt_user_ns(old_path.mnt);
+ rd.new_dir = new_path.dentry->d_inode;
+ rd.new_dentry = new_dentry;
+ rd.new_mnt_userns = mnt_user_ns(new_path.mnt);
+ rd.delegated_inode = &delegated_inode;
+ rd.flags = flags;
+ error = vfs_rename(&rd);
exit5:
dput(new_dentry);
exit4:
fsloc->locations = NULL;
}
+static int export_stats_init(struct export_stats *stats)
+{
+ stats->start_time = ktime_get_seconds();
+ return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
+static void export_stats_reset(struct export_stats *stats)
+{
+ nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
+static void export_stats_destroy(struct export_stats *stats)
+{
+ nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
static void svc_export_put(struct kref *ref)
{
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
+ export_stats_destroy(&exp->ex_stats);
kfree(exp->ex_uuid);
kfree_rcu(exp, ex_rcu);
}
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
- static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ static int check_export(struct path *path, int *flags, unsigned char *uuid)
{
+ struct inode *inode = d_inode(path->dentry);
/*
* We currently export only dirs, regular files, and (for v4
* or an FSID number (so NFSEXP_FSID or ->uuid is needed).
* 2: We must be able to find an inode from a filehandle.
* This means that s_export_op must be set.
+ * 3: We must not currently be on an idmapped mount.
*/
if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
!(*flags & NFSEXP_FSID) &&
return -EINVAL;
}
+ if (mnt_user_ns(path->mnt) != &init_user_ns) {
+ dprintk("exp_export: export of idmapped mounts not yet supported.\n");
+ return -EINVAL;
+ }
+
if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK &&
!(*flags & NFSEXP_NOSUBTREECHECK)) {
dprintk("%s: %s does not support subtree checking!\n",
goto out4;
}
- err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags,
- exp.ex_uuid);
+ err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid);
if (err)
goto out4;
/*
kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
static void show_secinfo(struct seq_file *m, struct svc_export *exp);
+static int is_export_stats_file(struct seq_file *m)
+{
+ /*
+ * The export_stats file uses the same ops as the exports file.
+ * We use the file's name to determine the reported info per export.
+ * There is no rename in nsfdfs, so d_name.name is stable.
+ */
+ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats");
+}
+
static int svc_export_show(struct seq_file *m,
struct cache_detail *cd,
struct cache_head *h)
{
- struct svc_export *exp ;
+ struct svc_export *exp;
+ bool export_stats = is_export_stats_file(m);
- if (h ==NULL) {
- seq_puts(m, "#path domain(flags)\n");
+ if (h == NULL) {
+ if (export_stats)
+ seq_puts(m, "#path domain start-time\n#\tstats\n");
+ else
+ seq_puts(m, "#path domain(flags)\n");
return 0;
}
exp = container_of(h, struct svc_export, h);
seq_path(m, &exp->ex_path, " \t\n\\");
seq_putc(m, '\t');
seq_escape(m, exp->ex_client->name, " \t\n\\");
+ if (export_stats) {
+ seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ seq_printf(m, "\tfh_stale: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ seq_printf(m, "\tio_read: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ seq_printf(m, "\tio_write: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ seq_putc(m, '\n');
+ return 0;
+ }
seq_putc(m, '(');
- if (test_bit(CACHE_VALID, &h->flags) &&
+ if (test_bit(CACHE_VALID, &h->flags) &&
!test_bit(CACHE_NEGATIVE, &h->flags)) {
exp_flags(m, exp->ex_flags, exp->ex_fsid,
exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
+ export_stats_reset(&new->ex_stats);
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
static struct cache_head *svc_export_alloc(void)
{
struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL);
- if (i)
- return &i->h;
- else
+ if (!i)
+ return NULL;
+
+ if (export_stats_init(&i->ex_stats)) {
+ kfree(i);
return NULL;
+ }
+
+ return &i->h;
}
static const struct cache_detail svc_export_cache_template = {
struct cache_head *cp = p;
struct svc_export *exp = container_of(cp, struct svc_export, h);
struct cache_detail *cd = m->private;
+ bool export_stats = is_export_stats_file(m);
if (p == SEQ_START_TOKEN) {
seq_puts(m, "# Version 1.1\n");
- seq_puts(m, "# Path Client(Flags) # IPs\n");
+ if (export_stats)
+ seq_puts(m, "# Path Client Start-time\n#\tStats\n");
+ else
+ seq_puts(m, "# Path Client(Flags) # IPs\n");
return 0;
}
fh_lock(fh);
- error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+ error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ argp->acl_default);
if (error)
goto out_drop_lock;
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *argp = rqstp->rq_argp;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
return 0;
- argp->mask = ntohl(*p); p++;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
-
static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- unsigned int base;
- int n;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
return 0;
- argp->mask = ntohl(*p++);
- if (argp->mask & ~NFS_ACL_MASK ||
- !xdr_argsize_check(rqstp, p))
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
return 0;
-
- base = (char *)p - (char *)head->iov_base;
- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- (argp->mask & NFS_ACL) ?
- &argp->acl_access : NULL);
- if (n > 0)
- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- (argp->mask & NFS_DFACL) ?
- &argp->acl_default : NULL);
- return (n > 0);
-}
-
-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nfsd_fhandle *argp = rqstp->rq_argp;
-
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (argp->mask & ~NFS_ACL_MASK)
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ &argp->acl_access : NULL))
return 0;
- return xdr_argsize_check(rqstp, p);
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ &argp->acl_default : NULL))
+ return 0;
+
+ return 1;
}
static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nfsd3_accessargs *argp = rqstp->rq_argp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nfsd3_accessargs *args = rqstp->rq_argp;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->access) < 0)
return 0;
- argp->access = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
/*
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
+ .pc_name = "NULL",
},
[ACLPROC2_GETACL] = {
.pc_func = nfsacld_proc_getacl,
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
+ .pc_name = "GETACL",
},
[ACLPROC2_SETACL] = {
.pc_func = nfsacld_proc_setacl,
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "SETACL",
},
[ACLPROC2_GETATTR] = {
.pc_func = nfsacld_proc_getattr,
- .pc_decode = nfsaclsvc_decode_fhandleargs,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfsaclsvc_encode_attrstatres,
.pc_release = nfsaclsvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "GETATTR",
},
[ACLPROC2_ACCESS] = {
.pc_func = nfsacld_proc_access,
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1,
+ .pc_name = "SETATTR",
},
};
fh_lock(fh);
- error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+ error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ argp->acl_default);
out_drop_lock:
fh_unlock(fh);
/*
* XDR decode functions
*/
+
static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *args = rqstp->rq_argp;
- p = nfs3svc_decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
return 0;
- args->mask = ntohl(*p); p++;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
-
static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nfsd3_setaclargs *args = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- unsigned int base;
- int n;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
- p = nfs3svc_decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
return 0;
- args->mask = ntohl(*p++);
- if (args->mask & ~NFS_ACL_MASK ||
- !xdr_argsize_check(rqstp, p))
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
+ return 0;
+ if (argp->mask & ~NFS_ACL_MASK)
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ &argp->acl_access : NULL))
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ &argp->acl_default : NULL))
return 0;
- base = (char *)p - (char *)head->iov_base;
- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- (args->mask & NFS_ACL) ?
- &args->acl_access : NULL);
- if (n > 0)
- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- (args->mask & NFS_DFACL) ?
- &args->acl_default : NULL);
- return (n > 0);
+ return 1;
}
/*
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
+ .pc_name = "NULL",
},
[ACLPROC3_GETACL] = {
.pc_func = nfsd3_proc_getacl,
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
+ .pc_name = "GETACL",
},
[ACLPROC3_SETACL] = {
.pc_func = nfsd3_proc_setacl,
.pc_ressize = sizeof(struct nfsd3_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT,
+ .pc_name = "SETACL",
},
};
/* make sure parents give x permission to user */
int err;
parent = dget_parent(tdentry);
- err = inode_permission(d_inode(parent), MAY_EXEC);
+ err = inode_permission(&init_user_ns,
+ d_inode(parent), MAY_EXEC);
if (err < 0) {
dput(parent);
break;
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
{
- struct svc_export *exp;
+ struct svc_export *exp = NULL;
struct dentry *dentry;
__be32 error;
}
out:
if (error == nfserr_stale)
- nfsdstats.fh_stale++;
+ nfsd_stats_fh_stale_inc(exp);
return error;
}
if (delta < 0)
delta = -delta;
if (delta < MAX_TOUCH_TIME_ERROR &&
- setattr_prepare(fhp->fh_dentry, iap) != 0) {
+ setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) {
/*
* Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
* This will cause notify_change to set these times
static __be32
nfsd_proc_readlink(struct svc_rqst *rqstp)
{
- struct nfsd_readlinkargs *argp = rqstp->rq_argp;
+ struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
+ char *buffer = page_address(*(rqstp->rq_next_page++));
dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
resp->len = NFS_MAXPATHLEN;
- resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
+ resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len);
fh_put(&argp->fh);
return rpc_success;
{
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
+ unsigned int len;
u32 eof;
+ int v;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->count, argp->offset);
+ argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+
+ v = 0;
+ len = argp->count;
+ while (len > 0) {
+ struct page *page = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(page);
+ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+ len -= rqstp->rq_vec[v].iov_len;
+ v++;
+ }
+
/* Obtain buffer pointer for payload. 19 is 1 word for
* status, 17 words for fattr, and 1 word for the byte count.
*/
-
- if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
- char buf[RPC_MAX_ADDRBUFLEN];
- printk(KERN_NOTICE
- "oversized read request from %s (%d bytes)\n",
- svc_print_addr(rqstp, buf, sizeof(buf)),
- argp->count);
- argp->count = NFSSVC_MAXBLKSIZE_V2;
- }
svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
resp->count = argp->count;
- resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &resp->count,
- &eof);
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, v, &resp->count, &eof);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
return rpc_success;
}
+static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
+ struct nfsd_readdirres *resp,
+ int count)
+{
+ count = min_t(u32, count, PAGE_SIZE);
+
+ /* Convert byte count to number of words (i.e. >> 2),
+ * and reserve room for the NULL ptr & eof flag (-2 words) */
+ resp->buflen = (count >> 2) - 2;
+
+ resp->buffer = page_address(*rqstp->rq_next_page);
+ rqstp->rq_next_page++;
+}
+
/*
* Read a portion of a directory.
*/
{
struct nfsd_readdirargs *argp = rqstp->rq_argp;
struct nfsd_readdirres *resp = rqstp->rq_resp;
- int count;
loff_t offset;
+ __be32 *buffer;
dprintk("nfsd: READDIR %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->count, argp->cookie);
- /* Shrink to the client read size */
- count = (argp->count >> 2) - 2;
-
- /* Make sure we've room for the NULL ptr & eof flag */
- count -= 2;
- if (count < 0)
- count = 0;
+ nfsd_init_dirlist_pages(rqstp, resp, argp->count);
+ buffer = resp->buffer;
- resp->buffer = argp->buffer;
resp->offset = NULL;
- resp->buflen = count;
resp->common.err = nfs_ok;
/* Read directory and encode entries on the fly */
offset = argp->cookie;
resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
&resp->common, nfssvc_encode_entry);
- resp->count = resp->buffer - argp->buffer;
+ resp->count = resp->buffer - buffer;
if (resp->offset)
*resp->offset = htonl(offset);
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "NULL",
},
[NFSPROC_GETATTR] = {
.pc_func = nfsd_proc_getattr,
- .pc_decode = nfssvc_decode_fhandle,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_attrstat,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "GETATTR",
},
[NFSPROC_SETATTR] = {
.pc_func = nfsd_proc_setattr,
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
+ .pc_name = "SETATTR",
},
[NFSPROC_ROOT] = {
.pc_func = nfsd_proc_root,
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "ROOT",
},
[NFSPROC_LOOKUP] = {
.pc_func = nfsd_proc_lookup,
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "LOOKUP",
},
[NFSPROC_READLINK] = {
.pc_func = nfsd_proc_readlink,
- .pc_decode = nfssvc_decode_readlinkargs,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_readlinkres,
- .pc_argsize = sizeof(struct nfsd_readlinkargs),
+ .pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+ .pc_name = "READLINK",
},
[NFSPROC_READ] = {
.pc_func = nfsd_proc_read,
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ .pc_name = "READ",
},
[NFSPROC_WRITECACHE] = {
.pc_func = nfsd_proc_writecache,
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "WRITECACHE",
},
[NFSPROC_WRITE] = {
.pc_func = nfsd_proc_write,
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
+ .pc_name = "WRITE",
},
[NFSPROC_CREATE] = {
.pc_func = nfsd_proc_create,
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "CREATE",
},
[NFSPROC_REMOVE] = {
.pc_func = nfsd_proc_remove,
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "REMOVE",
},
[NFSPROC_RENAME] = {
.pc_func = nfsd_proc_rename,
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "RENAME",
},
[NFSPROC_LINK] = {
.pc_func = nfsd_proc_link,
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "LINK",
},
[NFSPROC_SYMLINK] = {
.pc_func = nfsd_proc_symlink,
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "SYMLINK",
},
[NFSPROC_MKDIR] = {
.pc_func = nfsd_proc_mkdir,
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "MKDIR",
},
[NFSPROC_RMDIR] = {
.pc_func = nfsd_proc_rmdir,
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "RMDIR",
},
[NFSPROC_READDIR] = {
.pc_func = nfsd_proc_readdir,
.pc_argsize = sizeof(struct nfsd_readdirargs),
.pc_ressize = sizeof(struct nfsd_readdirres),
.pc_cachetype = RC_NOCACHE,
+ .pc_name = "READDIR",
},
[NFSPROC_STATFS] = {
.pc_func = nfsd_proc_statfs,
- .pc_decode = nfssvc_decode_fhandle,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_statfsres,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_statfsres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+5,
+ .pc_name = "STATFS",
},
};
.ia_size = iap->ia_size,
};
- host_err = notify_change(dentry, &size_attr, NULL);
+ host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
if (host_err)
goto out_unlock;
iap->ia_valid &= ~ATTR_SIZE;
}
iap->ia_valid |= ATTR_CTIME;
- host_err = notify_change(dentry, iap, NULL);
+ host_err = notify_change(&init_user_ns, dentry, iap, NULL);
out_unlock:
fh_unlock(fhp);
return 0;
if (!(inode->i_mode & S_ISVTX))
return 0;
- if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
+ if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME,
+ NULL, 0) <= 0)
return 0;
return 1;
}
unsigned long *count, u32 *eof, ssize_t host_err)
{
if (host_err >= 0) {
- nfsdstats.io_read += host_err;
+ nfsd_stats_io_read_add(fhp->fh_export, host_err);
*eof = nfsd_eof_on_read(file, offset, host_err, *count);
*count = host_err;
fsnotify_access(file);
goto out_nfserr;
}
*cnt = host_err;
- nfsdstats.io_write += *cnt;
+ nfsd_stats_io_write_add(exp, *cnt);
fsnotify_modify(file);
if (stable && use_wgather) {
host_err = 0;
switch (type) {
case S_IFREG:
- host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+ host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
if (!host_err)
nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
- host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+ host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
if (!host_err && unlikely(d_unhashed(dchild))) {
struct dentry *d;
d = lookup_one_len(dchild->d_name.name,
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+ host_err = vfs_mknod(&init_user_ns, dirp, dchild,
+ iap->ia_mode, rdev);
break;
default:
printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
if (!IS_POSIXACL(dirp))
iap->ia_mode &= ~current_umask();
- host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+ host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
if (host_err < 0) {
fh_drop_write(fhp);
goto out_nfserr;
if (IS_ERR(dnew))
goto out_nfserr;
- host_err = vfs_symlink(d_inode(dentry), dnew, path);
+ host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
err = nfserrno(host_err);
if (!err)
err = nfserrno(commit_metadata(fhp));
err = nfserr_noent;
if (d_really_is_negative(dold))
goto out_dput;
- host_err = vfs_link(dold, dirp, dnew, NULL);
+ host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
close_cached = true;
goto out_dput_old;
} else {
- host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+ struct renamedata rd = {
+ .old_mnt_userns = &init_user_ns,
+ .old_dir = fdir,
+ .old_dentry = odentry,
+ .new_mnt_userns = &init_user_ns,
+ .new_dir = tdir,
+ .new_dentry = ndentry,
+ };
+ host_err = vfs_rename(&rd);
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
if (type != S_IFDIR) {
if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
nfsd_close_cached_files(rdentry);
- host_err = vfs_unlink(dirp, rdentry, NULL);
+ host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
} else {
- host_err = vfs_rmdir(dirp, rdentry);
+ host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
}
if (!host_err)
inode_lock_shared(inode);
- len = vfs_getxattr(dentry, name, NULL, 0);
+ len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
/*
* Zero-length attribute, just return.
goto out;
}
- len = vfs_getxattr(dentry, name, buf, len);
+ len = vfs_getxattr(&init_user_ns, dentry, name, buf, len);
if (len <= 0) {
kvfree(buf);
buf = NULL;
fh_lock(fhp);
- ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+ ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
+ name, NULL);
fh_unlock(fhp);
fh_drop_write(fhp);
return nfserrno(ret);
fh_lock(fhp);
- ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
- NULL);
+ ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
+ len, flags, NULL);
fh_unlock(fhp);
fh_drop_write(fhp);
return 0;
/* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
- err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+ err = inode_permission(&init_user_ns, inode,
+ acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
/* Allow read access to binaries even when mode 111 */
if (err == -EACCES && S_ISREG(inode->i_mode) &&
(acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
- err = inode_permission(inode, MAY_EXEC);
+ err = inode_permission(&init_user_ns, inode, MAY_EXEC);
return err? nfserrno(err) : 0;
}
}
/* you can only watch an inode if you have read permissions on it */
- ret = inode_permission(path->dentry->d_inode, MAY_READ);
+ ret = path_permission(path, MAY_READ);
if (ret) {
path_put(path);
goto out;
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+ group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
if (IS_ERR(group)) {
free_uid(user);
return PTR_ERR(group);
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
- error = inode_permission(path->dentry->d_inode, MAY_READ);
+ error = path_permission(path, MAY_READ);
if (error) {
path_put(path);
return error;
struct fsnotify_group *group;
struct inotify_event_info *oevent;
- group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+ group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
if (IS_ERR(group))
return group;
- oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+ oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
if (unlikely(!oevent)) {
fsnotify_destroy_group(group);
return ERR_PTR(-ENOMEM);
needs_barrier = true;
err = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!err)
err = ret;
}
return ret;
}
- int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
int status = 0, size_change;
int inode_locked = 0;
if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- status = setattr_prepare(dentry, attr);
+ status = setattr_prepare(&init_user_ns, dentry, attr);
if (status)
return status;
}
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
return status;
}
- int ocfs2_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
struct super_block *sb = path->dentry->d_sb;
goto bail;
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
/*
* If there is inline data in the inode, the inode will normally not
* have data blocks allocated (it may have an external xattr block).
return err;
}
- int ocfs2_permission(struct inode *inode, int mask)
+ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
int ret, had_lock;
struct ocfs2_lock_holder oh;
dump_stack();
}
- ret = generic_permission(inode, mask);
+ ret = generic_permission(&init_user_ns, inode, mask);
ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
out:
#include "internal.h"
- int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
- struct file *filp)
+ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
+ loff_t length, unsigned int time_attrs, struct file *filp)
{
int ret;
struct iattr newattrs;
inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
- ret = notify_change(dentry, &newattrs, NULL);
+ ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
return ret;
}
long vfs_truncate(const struct path *path, loff_t length)
{
+ struct user_namespace *mnt_userns;
struct inode *inode;
long error;
if (error)
goto out;
- error = inode_permission(inode, MAY_WRITE);
+ mnt_userns = mnt_user_ns(path->mnt);
+ error = inode_permission(mnt_userns, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
if (!error)
error = security_path_truncate(path);
if (!error)
- error = do_truncate(path->dentry, length, 0, NULL);
+ error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(inode);
/* Check IS_APPEND on real upper inode */
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
-
sb_start_write(inode->i_sb);
error = locks_verify_truncate(inode, f.file, length);
if (!error)
error = security_path_truncate(&f.file->f_path);
if (!error)
- error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+ error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, f.file);
sb_end_write(inode->i_sb);
out_putf:
fdput(f);
goto out_path_release;
}
- res = inode_permission(inode, mode | MAY_ACCESS);
+ res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
- error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+ error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change(mnt_user_ns(path->mnt), path->dentry,
+ &newattrs, &delegated_inode);
out_unlock:
inode_unlock(inode);
if (delegated_inode) {
int chown_common(const struct path *path, uid_t user, gid_t group)
{
+ struct user_namespace *mnt_userns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
int error;
uid = make_kuid(current_user_ns(), user);
gid = make_kgid(current_user_ns(), group);
+ mnt_userns = mnt_user_ns(path->mnt);
+ uid = kuid_from_mnt(mnt_userns, uid);
+ gid = kgid_from_mnt(mnt_userns, gid);
+
retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change(mnt_userns, path->dentry, &newattrs,
+ &delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
lookup_flags |= LOOKUP_BENEATH;
if (how->resolve & RESOLVE_IN_ROOT)
lookup_flags |= LOOKUP_IN_ROOT;
+ if (how->resolve & RESOLVE_CACHED) {
+ /* Don't bother even trying for create/truncate/tmpfile open */
+ if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ return -EAGAIN;
+ lookup_flags |= LOOKUP_CACHED;
+ }
op->lookup_flags = lookup_flags;
return 0;
if (ovl_is_private_xattr(sb, name))
continue;
+
+ error = security_inode_copy_up_xattr(name);
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
+ if (error == 1) {
+ error = 0;
+ continue; /* Discard */
+ }
retry:
- size = vfs_getxattr(old, name, value, value_size);
+ size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
if (size == -ERANGE)
- size = vfs_getxattr(old, name, NULL, 0);
+ size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
if (size < 0) {
error = size;
goto retry;
}
- error = vfs_setxattr(new, name, value, size, 0);
- error = security_inode_copy_up_xattr(name);
- if (error < 0 && error != -EOPNOTSUPP)
- break;
- if (error == 1) {
- error = 0;
- continue; /* Discard */
- }
+ error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
if (error) {
if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
break;
.ia_size = stat->size,
};
- return notify_change(upperdentry, &attr, NULL);
+ return notify_change(&init_user_ns, upperdentry, &attr, NULL);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
.ia_mtime = stat->mtime,
};
- return notify_change(upperdentry, &attr, NULL);
+ return notify_change(&init_user_ns, upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
- err = notify_change(upperdentry, &attr, NULL);
+ err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
- err = notify_change(upperdentry, &attr, NULL);
+ err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
ssize_t res;
char *buf;
- res = vfs_getxattr(dentry, name, NULL, 0);
+ res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
if (res == -ENODATA || res == -EOPNOTSUPP)
res = 0;
if (!buf)
return -ENOMEM;
- res = vfs_getxattr(dentry, name, buf, res);
+ res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
if (res < 0)
kfree(buf);
else
* don't want that to happen for normal copy-up operation.
*/
if (capability) {
- err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
- capability, cap_size, 0);
+ err = vfs_setxattr(&init_user_ns, upperpath.dentry,
+ XATTR_NAME_CAPS, capability, cap_size, 0);
if (err)
goto out_free;
}
if (err < 0)
goto out_free;
- err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+ err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
out_free:
kfree(buffer);
return err;
.ia_mode = cattr->mode,
};
inode_lock(newdentry->d_inode);
- err = notify_change(newdentry, &attr, NULL);
+ err = notify_change(&init_user_ns, newdentry, &attr, NULL);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
inode->i_state |= I_CREATING;
spin_unlock(&inode->i_lock);
- inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+ inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
attr.mode = inode->i_mode;
err = ovl_create_or_link(dentry, inode, &attr, false);
return err;
}
- static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+ static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
- static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
- static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t rdev)
+ static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return ovl_create_object(dentry, mode, rdev, NULL);
}
- static int ovl_symlink(struct inode *dir, struct dentry *dentry,
- const char *link)
+ static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
goto out_dput_upper;
if (is_dir)
- err = vfs_rmdir(dir, upper);
+ err = vfs_rmdir(&init_user_ns, dir, upper);
else
- err = vfs_unlink(dir, upper, NULL);
+ err = vfs_unlink(&init_user_ns, dir, upper, NULL);
ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
/*
buflen -= thislen;
memcpy(&buf[buflen], name, thislen);
- tmp = dget_dlock(d->d_parent);
spin_unlock(&d->d_lock);
+ tmp = dget_parent(d);
dput(d);
d = tmp;
return err;
}
- static int ovl_rename(struct inode *olddir, struct dentry *old,
- struct inode *newdir, struct dentry *new,
- unsigned int flags)
+ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
+ struct dentry *old, struct inode *newdir,
+ struct dentry *new, unsigned int flags)
{
int err;
struct dentry *old_upperdir;
acc_mode |= MAY_APPEND;
old_cred = ovl_override_creds(inode->i_sb);
- err = inode_permission(realinode, MAY_OPEN | acc_mode);
+ err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
if (err) {
realfile = ERR_PTR(err);
} else {
- if (!inode_owner_or_capable(realinode))
+ if (!inode_owner_or_capable(&init_user_ns, realinode))
flags &= ~O_NOATIME;
realfile = open_with_fake_path(&file->f_path, flags, realinode,
const struct cred *old_cred;
int ret;
- if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb)))
- return 0;
+ ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
+ if (ret <= 0)
+ return ret;
ret = ovl_real_fdget_meta(file, &real, !datasync);
if (ret)
long ret;
struct inode *inode = file_inode(file);
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
return -EACCES;
ret = mnt_want_write_file(file);
#include "overlayfs.h"
- int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
int err;
bool full_copy_up = false;
struct dentry *upperdentry;
const struct cred *old_cred;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err)
return err;
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
- err = notify_change(upperdentry, attr, NULL);
+ err = notify_change(&init_user_ns, upperdentry, attr, NULL);
revert_creds(old_cred);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
return 0;
}
- int ovl_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
enum ovl_path_type type;
return err;
}
- int ovl_permission(struct inode *inode, int mask)
+ int ovl_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
struct inode *upperinode = ovl_inode_upper(inode);
struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
* Check overlay inode with the creds of task and underlying inode
* with creds of mounter
*/
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns, inode, mask);
if (err)
return err;
/* Make sure mounter can read file for copy up later */
mask |= MAY_READ;
}
- err = inode_permission(realinode, mask);
+ err = inode_permission(&init_user_ns, realinode, mask);
revert_creds(old_cred);
return err;
goto out;
if (!value && !upperdentry) {
- err = vfs_getxattr(realdentry, name, NULL, 0);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
+ revert_creds(old_cred);
if (err < 0)
goto out_drop_write;
}
old_cred = ovl_override_creds(dentry->d_sb);
if (value)
- err = vfs_setxattr(realdentry, name, value, size, flags);
+ err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
+ flags);
else {
WARN_ON(flags != XATTR_REPLACE);
- err = vfs_removexattr(realdentry, name);
+ err = vfs_removexattr(&init_user_ns, realdentry, name);
}
revert_creds(old_cred);
ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(realdentry, name, value, size);
+ res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
revert_creds(old_cred);
return res;
}
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
- int err = vfs_rmdir(dir, dentry);
+ int err = vfs_rmdir(&init_user_ns, dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
- int err = vfs_unlink(dir, dentry, NULL);
+ int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry)
{
- int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+ int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
return err;
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_create(dir, dentry, mode, true);
+ int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_mkdir(dir, dentry, mode);
+ int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev)
{
- int err = vfs_mknod(dir, dentry, mode, dev);
+ int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
return err;
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname)
{
- int err = vfs_symlink(dir, dentry, oldname);
+ int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
size_t size)
{
const char *name = ovl_xattr(ofs, ox);
- return vfs_getxattr(dentry, name, value, size);
+ return vfs_getxattr(&init_user_ns, dentry, name, value, size);
}
static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
size_t size)
{
const char *name = ovl_xattr(ofs, ox);
- int err = vfs_setxattr(dentry, name, value, size, 0);
+ int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
dentry, name, min((int)size, 48), value, size, err);
return err;
enum ovl_xattr ox)
{
const char *name = ovl_xattr(ofs, ox);
- int err = vfs_removexattr(dentry, name);
+ int err = vfs_removexattr(&init_user_ns, dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
unsigned int flags)
{
int err;
+ struct renamedata rd = {
+ .old_mnt_userns = &init_user_ns,
+ .old_dir = olddir,
+ .old_dentry = olddentry,
+ .new_mnt_userns = &init_user_ns,
+ .new_dir = newdir,
+ .new_dentry = newdentry,
+ .flags = flags,
+ };
pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
- err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+ err = vfs_rename(&rd);
if (err) {
pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
- int err = vfs_whiteout(dir, dentry);
+ int err = vfs_whiteout(&init_user_ns, dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
{
- struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
+ struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
int err = PTR_ERR_OR_ZERO(ret);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
int padding);
+int ovl_sync_status(struct ovl_fs *ofs);
static inline bool ovl_is_impuredir(struct super_block *sb,
struct dentry *dentry)
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback);
- int ovl_setattr(struct dentry *dentry, struct iattr *attr);
- int ovl_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags);
- int ovl_permission(struct inode *inode, int mask);
+ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr);
+ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+ int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask);
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags);
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
struct super_block *upper_sb;
int ret;
- if (!ovl_upper_mnt(ofs))
- return 0;
+ ret = ovl_sync_status(ofs);
+ /*
+ * We have to always set the err, because the return value isn't
+ * checked in syncfs, and instead indirectly return an error via
+ * the sb's writeback errseq, which VFS inspects after this call.
+ */
+ if (ret < 0) {
+ errseq_set(&sb->s_wb_err, -EIO);
+ return -EIO;
+ }
+
+ if (!ret)
+ return ret;
- if (!ovl_should_sync(ofs))
- return 0;
/*
* Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
* All the super blocks will be iterated, including upper_sb.
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = vfs_removexattr(&init_user_ns, work,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+ err = vfs_removexattr(&init_user_ns, work,
+ XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
/* Clear any inherited mode bits */
inode_lock(work->d_inode);
- err = notify_change(work, &attr, NULL);
+ err = notify_change(&init_user_ns, work, &attr, NULL);
inode_unlock(work->d_inode);
if (err)
goto out_dput;
pr_err("filesystem on '%s' not supported\n", name);
goto out_put;
}
+ if (mnt_user_ns(path->mnt) != &init_user_ns) {
+ pr_err("idmapped layers are currently not supported\n");
+ goto out_put;
+ }
if (!d_is_dir(path->dentry)) {
pr_err("'%s' not a directory\n", name);
goto out_put;
static int __maybe_unused
ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
goto out_acl_release;
}
err = -EPERM;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(&init_user_ns, inode))
goto out_acl_release;
posix_acl_release(acl);
if (unlikely(inode->i_mode & S_ISGID) &&
handler->flags == ACL_TYPE_ACCESS &&
!in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
- err = ovl_setattr(dentry, &iattr);
+ err = ovl_setattr(&init_user_ns, dentry, &iattr);
if (err)
return err;
}
}
static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
}
static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
unsigned int numlower;
int err;
+ err = -EIO;
+ if (WARN_ON(sb->s_user_ns != current_user_ns()))
+ goto out;
+
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
sb->s_op = &ovl_super_operations;
if (ofs->config.upperdir) {
+ struct super_block *upper_sb;
+
if (!ofs->config.workdir) {
pr_err("missing 'workdir'\n");
goto out_err;
if (err)
goto out_err;
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ if (!ovl_should_sync(ofs)) {
+ ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
+ if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
+ err = -EIO;
+ pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
+ goto out_err;
+ }
+ }
+
err = ovl_get_workdir(sb, ofs, &upperpath);
if (err)
goto out_err;
if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
- sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth;
- sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran;
-
+ sb->s_stack_depth = upper_sb->s_stack_depth;
+ sb->s_time_gran = upper_sb->s_time_gran;
}
oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
err = PTR_ERR(oe);
BUG();
}
- err = inode_permission(inode, acc_mode | MAY_OPEN);
+ err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
if (err)
return ERR_PTR(err);
/* O_NOATIME is an optimization, don't fail if not permitted */
- if (inode_owner_or_capable(inode))
+ if (inode_owner_or_capable(&init_user_ns, inode))
flags |= O_NOATIME;
return dentry_open(path, flags, current_cred());
kfree(buf);
return ERR_PTR(res);
}
+
+/*
+ * ovl_sync_status() - Check fs sync status for volatile mounts
+ *
+ * Returns 1 if this is not a volatile mount and a real sync is required.
+ *
+ * Returns 0 if syncing can be skipped because mount is volatile, and no errors
+ * have occurred on the upperdir since the mount.
+ *
+ * Returns -errno if it is a volatile mount, and the error that occurred since
+ * the last mount. If the error code changes, it'll return the latest error
+ * code.
+ */
+
+int ovl_sync_status(struct ovl_fs *ofs)
+{
+ struct vfsmount *mnt;
+
+ if (ovl_should_sync(ofs))
+ return 1;
+
+ mnt = ovl_upper_mnt(ofs);
+ if (!mnt)
+ return 0;
+
+ return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
+}
return 0;
}
- static int proc_sys_permission(struct inode *inode, int mask)
+ static int proc_sys_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
/*
* sysctl entries that are not writeable,
return error;
}
- static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+ static int proc_sys_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error;
if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
}
- static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+ static int proc_sys_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
return 0;
}
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
/*
* To set sysctl options, we use a temporary mount of proc, look up the
* respective sys/ file and write to it. To avoid mounting it when no
file, param, val);
goto out;
}
- len = strlen(val);
wret = kernel_write(file, val, len, &pos);
if (wret < 0) {
err = wret;
struct fscrypt_name nm = {0};
int err;
- if (ubifs_inode(host)->xattr_cnt < ubifs_xattr_max_cnt(c))
+ if (ubifs_inode(host)->xattr_cnt <= ubifs_xattr_max_cnt(c))
return 0;
ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
}
static int xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
+ #include <linux/mount.h>
static const struct vm_operations_struct xfs_file_vm_ops;
return xfs_log_force_inode(ip);
}
+static xfs_lsn_t
+xfs_fsync_lsn(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_last_lsn;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_lsn_t lsn;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ lsn = xfs_fsync_lsn(ip, datasync);
+ if (lsn) {
+ error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+ log_flushed);
+
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
STATIC int
xfs_file_fsync(
struct file *file,
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
int error = 0;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
xfs_blkdev_issue_flush(mp->m_ddev_targp);
/*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode while not catch modifications
+ * that happen concurrently to the fsync call, but fsync semantics
+ * only require to sync previously completed I/O.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (!datasync ||
- (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = iip->ili_last_lsn;
- }
-
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields = 0;
- spin_unlock(&iip->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/*
* If we only have a single device, and the log force about was
return error;
}
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
ssize_t ret;
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_direct_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
file_accessed(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
ssize_t ret = 0;
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_dax_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
}
STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(iocb, to);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
- ret = xfs_file_dio_aio_read(iocb, to);
+ ret = xfs_file_dio_read(iocb, to);
else
- ret = xfs_file_buffered_aio_read(iocb, to);
+ ret = xfs_file_buffered_read(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
* if called for a direct write beyond i_size.
*/
STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
int *iolock)
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
if (error)
return error;
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
goto restart;
}
/*
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
} else
spin_unlock(&ip->i_flags_lock);
- /*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
- */
return file_modified(file);
}
};
/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- return -EINVAL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
/*
- * Don't take the exclusive iolock here unless the I/O is unaligned to
- * the file system block size. We don't need to consider the EOF
- * extension case here because xfs_file_aio_write_checks() will relock
- * the inode as necessary for EOF zeroing cases and fill out the new
- * inode size as appropriate.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
-
- /*
- * We can't properly handle unaligned direct I/O to reflink
- * files yet, as we can't unshare a partial block.
- */
- if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -ENOTBLK;
- }
- iolock = XFS_IOLOCK_EXCL;
- } else {
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+ if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
- goto out;
- count = iov_iter_count(from);
+ return ret;
/*
- * If we are doing unaligned IO, we can't allow any other overlapping IO
- * in-flight at the same time or we risk data corruption. Wait for all
- * other IO to drain before we submit. If the IO is aligned, demote the
- * iolock if we had to take the exclusive lock in
- * xfs_file_aio_write_checks() for other reasons.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (unaligned_io) {
- inode_dio_wait(inode);
- } else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
/*
- * If unaligned, this is the only IO in-flight. Wait on it before we
- * release the iolock to prevent subsequent overlapping IO.
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
*/
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
-out:
- xfs_iunlock(ip, iolock);
+ &xfs_dio_write_ops, flags);
/*
- * No fallback to buffered IO after short writes for XFS, direct I/O
- * will either complete fully or return an error.
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- ASSERT(ret < 0 || ret == count);
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
struct xfs_inode *ip = XFS_I(inode);
int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
- size_t count;
loff_t pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
pos = iocb->ki_pos;
- count = iov_iter_count(from);
- trace_xfs_file_dax_write(ip, count, pos);
+ trace_xfs_file_dax_write(iocb, from);
ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
out:
- xfs_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
if (error)
return error;
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
+ bool cleared_space = false;
int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
* metadata space. This reduces the chances that the eofblocks scan
* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
* also behaves as a filter to prevent too many eofblocks scans from
- * running at the same time.
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
+ xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
struct xfs_eofblocks eofb = {0};
- enospc = 1;
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ xfs_blockgc_free_space(ip->i_mount, &eofb);
goto write_retry;
}
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
+ ret = xfs_file_dio_write(iocb, from);
if (ret != -ENOTBLK)
return ret;
}
- return xfs_file_buffered_aio_write(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
static void
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_dentry(file), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+ file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static void
+static vm_fault_t
xfs_filemap_map_pages(
struct vm_fault *vmf,
pgoff_t start_pgoff,
pgoff_t end_pgoff)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
}
static const struct vm_operations_struct xfs_file_vm_ops = {
*/
static int
xfs_init_new_inode(
+ struct user_namespace *mnt_userns,
struct xfs_trans *tp,
struct xfs_inode *pip,
xfs_ino_t ino,
prid_t prid,
struct xfs_inode **ipp)
{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_inode *ip;
unsigned int flags;
ASSERT(ip != NULL);
inode = VFS_I(ip);
- inode->i_mode = mode;
set_nlink(inode, nlink);
- inode->i_uid = fsuid_into_mnt(mnt_userns);
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- inode->i_gid = VFS_I(pip)->i_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) &&
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
- inode->i_uid = current_fsuid();
++ inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
} else {
- inode_init_owner(inode, dir, mode);
- inode->i_gid = fsgid_into_mnt(mnt_userns);
++ inode_init_owner(mnt_userns, inode, dir, mode);
}
/*
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
if (irix_sgid_inherit &&
- (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
+ (inode->i_mode & S_ISGID) &&
+ !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
*/
int
xfs_dir_ialloc(
+ struct user_namespace *mnt_userns,
struct xfs_trans **tpp,
struct xfs_inode *dp,
umode_t mode,
return error;
ASSERT(ino != NULLFSINO);
- return xfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, prid, ipp);
+ return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev,
+ prid, ipp);
}
/*
int
xfs_create(
+ struct user_namespace *mnt_userns,
xfs_inode_t *dp,
struct xfs_name *name,
umode_t mode,
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ resblks, &tp);
}
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- /*
- * Reserve disk quota and the inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev,
+ prid, &ip);
if (error)
goto out_trans_cancel;
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
int
xfs_create_tmpfile(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
umode_t mode,
struct xfs_inode **ipp)
resblks = XFS_IALLOC_SPACE_RES(mp);
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- if (error)
- goto out_release_inode;
-
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error)
- goto out_trans_cancel;
+ goto out_release_dquots;
- error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto error_return;
+
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
*/
static int
xfs_rename_alloc_whiteout(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
struct xfs_inode **wip)
{
struct xfs_inode *tmpfile;
int error;
- error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+ error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
+ &tmpfile);
if (error)
return error;
*/
int
xfs_rename(
+ struct user_namespace *mnt_userns,
struct xfs_inode *src_dp,
struct xfs_name *src_name,
struct xfs_inode *src_ip,
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_buf *agibp;
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
*/
if (flags & RENAME_WHITEOUT) {
ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
- error = xfs_rename_alloc_whiteout(target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
if (error)
return error;
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
+ *
+ * Extent count overflow check:
+ *
+ * From the perspective of src_dp, a rename operation is essentially a
+ * directory entry remove operation. Hence the only place where we check
+ * for extent count overflow for src_dp is in
+ * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
+ * -ENOSPC when it detects a possible extent count overflow and in
+ * response, the higher layers of directory handling code do the
+ * following:
+ * 1. Data/Free blocks: XFS lets these blocks linger until a
+ * future remove operation removes them.
+ * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
+ * Leaf space and unmaps the last block.
+ *
+ * For target_dp, there are two cases depending on whether the
+ * destination directory entry exists or not.
+ *
+ * When destination directory entry does not exist (i.e. target_ip ==
+ * NULL), extent count overflow check is performed only when transaction
+ * has a non-zero sized space reservation associated with it. With a
+ * zero-sized space reservation, XFS allows a rename operation to
+ * continue only when the directory has sufficient free space in its
+ * data/leaf/free space blocks to hold the new entry.
+ *
+ * When destination directory entry exists (i.e. target_ip != NULL), all
+ * we need to do is change the inode number associated with the already
+ * existing entry. Hence there is no need to perform an extent count
+ * overflow check.
*/
if (target_ip == NULL) {
/*
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
+ } else {
+ error = xfs_iext_count_may_overflow(target_dp,
+ XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto out_trans_cancel;
}
} else {
/*
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ error = xfs_read_agi(mp, tp, agno, &bp);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
-
- /*
- * Check whether the replace operation will need to allocate
- * blocks. This happens when the shortform directory lacks
- * space and we have to convert it to a block format directory.
- * When more blocks are necessary, we must lock the AGI first
- * to preserve locking order (AGI -> AGF).
- */
- if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- error = xfs_read_agi(mp, tp,
- XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- &agibp);
- if (error)
- goto out_trans_cancel;
- }
-
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
if (wip) {
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else
+ } else {
+ /*
+ * NOTE: We don't need to check for extent count overflow here
+ * because the dir remove name code will leave the dir block in
+ * place if the extent count would overflow.
+ */
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
+ }
+
if (error)
goto out_trans_cancel;
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = bf->l_start;
- error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp),
+ &iattr);
if (error)
goto out_unlock;
STATIC int
xfs_ioc_fsbulkstat(
- xfs_mount_t *mp,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_fsop_bulkreq bulkreq;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
.ocount = 0,
};
xfs_ino_t lastino;
/* Handle the v5 bulkstat ioctl. */
STATIC int
xfs_ioc_bulkstat(
- struct xfs_mount *mp,
+ struct file *file,
unsigned int cmd,
struct xfs_bulkstat_req __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_bulk_ireq hdr;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
};
int error;
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip,
- struct file *file)
++ struct file *file,
+ struct xfs_dquot *pdqp)
{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out_unlock;
+ goto out_error;
error = -EIO;
if (XFS_FORCED_SHUTDOWN(mp))
- goto out_unlock;
+ goto out_error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ capable(CAP_FOWNER), &tp);
if (error)
- goto out_unlock;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ goto out_error;
/*
* CAP_FOWNER overrides the following restrictions:
* The user ID of the calling process must be equal to the file owner
* ID, except in cases where the CAP_FSETID capability is applicable.
*/
- if (!inode_owner_or_capable(VFS_I(ip))) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) {
error = -EPERM;
goto out_cancel;
}
out_cancel:
xfs_trans_cancel(tp);
-out_unlock:
+out_error:
return ERR_PTR(error);
}
STATIC int
xfs_ioctl_setattr(
- xfs_inode_t *ip,
+ struct file *file,
struct fsxattr *fa)
{
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ struct xfs_inode *ip = XFS_I(file_inode(file));
struct fsxattr old_fa;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
- int code;
+ int error;
trace_xfs_ioctl_setattr(ip);
- code = xfs_ioctl_setattr_check_projid(ip, fa);
- if (code)
- return code;
+ error = xfs_ioctl_setattr_check_projid(ip, fa);
+ if (error)
+ return error;
/*
* If disk quotas is on, we make sure that the dquots do exist on disk,
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
VFS_I(ip)->i_gid, fa->fsx_projid,
XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
- if (code)
- return code;
+ if (error)
+ return error;
}
xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip, pdqp);
- tp = xfs_ioctl_setattr_get_trans(file);
++ tp = xfs_ioctl_setattr_get_trans(file, pdqp);
if (IS_ERR(tp)) {
- code = PTR_ERR(tp);
+ error = PTR_ERR(tp);
goto error_free_dquots;
}
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- ip->i_d.di_projid != fa->fsx_projid) {
- code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
- if (code) /* out of quota */
- goto error_trans_cancel;
- }
-
xfs_fill_fsxattr(ip, false, &old_fa);
- code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- if (code)
+ error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_extsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_xflags(tp, ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ if (error)
goto error_trans_cancel;
/*
*/
if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
- !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID))
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
else
ip->i_d.di_cowextsize = 0;
- code = xfs_trans_commit(tp);
+ error = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
error_trans_cancel:
xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
}
STATIC int
xfs_ioc_fssetxattr(
- xfs_inode_t *ip,
struct file *filp,
void __user *arg)
{
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_ioctl_setattr(ip, &fa);
+ error = xfs_ioctl_setattr(filp, &fa);
mnt_drop_write_file(filp);
return error;
}
xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(ip, NULL);
- tp = xfs_ioctl_setattr_get_trans(filp);
++ tp = xfs_ioctl_setattr_get_trans(filp, NULL);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
case XFS_IOC_FSBULKSTAT_SINGLE:
case XFS_IOC_FSBULKSTAT:
case XFS_IOC_FSINUMBERS:
- return xfs_ioc_fsbulkstat(mp, cmd, arg);
+ return xfs_ioc_fsbulkstat(filp, cmd, arg);
case XFS_IOC_BULKSTAT:
- return xfs_ioc_bulkstat(mp, cmd, arg);
+ return xfs_ioc_bulkstat(filp, cmd, arg);
case XFS_IOC_INUMBERS:
return xfs_ioc_inumbers(mp, cmd, arg);
case XFS_IOC_FSGETXATTRA:
return xfs_ioc_fsgetxattr(ip, 1, arg);
case XFS_IOC_FSSETXATTR:
- return xfs_ioc_fssetxattr(ip, filp, arg);
+ return xfs_ioc_fssetxattr(filp, arg);
case XFS_IOC_GETXFLAGS:
return xfs_ioc_getxflags(ip, arg);
case XFS_IOC_SETXFLAGS:
}
case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
+ struct xfs_growfs_data in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
}
case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
+ struct xfs_growfs_log in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
if (error)
return error;
+ trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
+
sb_start_write(mp->m_super);
- error = xfs_icache_free_eofblocks(mp, &keofb);
+ error = xfs_blockgc_free_space(mp, &keofb);
sb_end_write(mp->m_super);
return error;
}
STATIC int
xfs_generic_create(
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry,
umode_t mode,
goto out_free_acl;
if (!tmpfile) {
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev,
+ &ip);
} else {
- error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
+ error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip);
}
if (unlikely(error))
goto out_free_acl;
STATIC int
xfs_vn_mknod(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t rdev)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
{
- return xfs_generic_create(dir, dentry, mode, rdev, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false);
}
STATIC int
xfs_vn_create(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- bool flags)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool flags)
{
- return xfs_generic_create(dir, dentry, mode, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false);
}
STATIC int
xfs_vn_mkdir(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
+ false);
}
STATIC struct dentry *
STATIC int
xfs_vn_symlink(
- struct inode *dir,
- struct dentry *dentry,
- const char *symname)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
{
struct inode *inode;
struct xfs_inode *cip = NULL;
if (unlikely(error))
goto out;
- error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+ error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip);
if (unlikely(error))
goto out;
STATIC int
xfs_vn_rename(
- struct inode *odir,
- struct dentry *odentry,
- struct inode *ndir,
- struct dentry *ndentry,
- unsigned int flags)
+ struct user_namespace *mnt_userns,
+ struct inode *odir,
+ struct dentry *odentry,
+ struct inode *ndir,
+ struct dentry *ndentry,
+ unsigned int flags)
{
struct inode *new_inode = d_inode(ndentry);
int omode = 0;
if (unlikely(error))
return error;
- return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
- XFS_I(ndir), &nname,
+ return xfs_rename(mnt_userns, XFS_I(odir), &oname,
+ XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
new_inode ? XFS_I(new_inode) : NULL, flags);
}
STATIC int
xfs_vn_getattr(
+ struct user_namespace *mnt_userns,
const struct path *path,
struct kstat *stat,
u32 request_mask,
stat->dev = inode->i_sb->s_dev;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = inode->i_uid;
- stat->gid = inode->i_gid;
+ stat->uid = i_uid_into_mnt(mnt_userns, inode);
+ stat->gid = i_gid_into_mnt(mnt_userns, inode);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
static int
xfs_vn_change_ok(
- struct dentry *dentry,
- struct iattr *iattr)
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
{
struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- return setattr_prepare(dentry, iattr);
+ return setattr_prepare(mnt_userns, dentry, iattr);
}
/*
*/
static int
xfs_setattr_nonsize(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
struct iattr *iattr)
{
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ capable(CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Change file ownership. Must be the owner or privileged.
*/
uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
/*
- * Do a quota reservation only if uid/gid is actually
- * going to change.
- */
- if (XFS_IS_QUOTA_RUNNING(mp) &&
- ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
- (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
- ASSERT(tp);
- error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- NULL, capable(CAP_FOWNER) ?
- XFS_QMOPT_FORCE_RES : 0);
- if (error) /* out of quota */
- goto out_cancel;
- }
-
- /*
* CAP_FSETID overrides the following restrictions:
*
* The set-user-ID and set-group-ID bits of a file will be
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
/*
* Release any dquot(s) the inode had kept before chown.
*/
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
if (error)
return error;
}
return 0;
-out_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
*/
STATIC int
xfs_setattr_size(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
struct iattr *iattr)
{
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
/*
int
xfs_vn_setattr_size(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
trace_xfs_setattr(ip);
- error = xfs_vn_change_ok(dentry, iattr);
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(ip, iattr);
+ return xfs_setattr_size(mnt_userns, ip, iattr);
}
STATIC int
xfs_vn_setattr(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
return error;
}
- error = xfs_vn_setattr_size(dentry, iattr);
+ error = xfs_vn_setattr_size(mnt_userns, dentry, iattr);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
trace_xfs_setattr(ip);
- error = xfs_vn_change_ok(dentry, iattr);
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
return error;
STATIC int
xfs_vn_tmpfile(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- return xfs_generic_create(dir, dentry, mode, 0, true);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true);
}
static const struct inode_operations xfs_inode_operations = {
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ipp);
+ error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0,
+ 0, ipp);
if (error) {
xfs_trans_cancel(tp);
return error;
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode is going to keep
- * this dquot pointer even after the trans_commit.
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
*/
- *IO_olddq = xfs_qm_dqhold(newdq);
-
- return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp,
- struct xfs_dquot *pdqp,
- uint flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t delblks;
- unsigned int blkflags;
- struct xfs_dquot *udq_unres = NULL;
- struct xfs_dquot *gdq_unres = NULL;
- struct xfs_dquot *pdq_unres = NULL;
- struct xfs_dquot *udq_delblks = NULL;
- struct xfs_dquot *gdq_delblks = NULL;
- struct xfs_dquot *pdq_delblks = NULL;
- int error;
-
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- delblks = ip->i_delayed_blks;
- blkflags = XFS_IS_REALTIME_INODE(ip) ?
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
- if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- i_uid_read(VFS_I(ip)) != udqp->q_id) {
- udq_delblks = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- udq_unres = ip->i_udquot;
- }
- }
- if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- i_gid_read(VFS_I(ip)) != gdqp->q_id) {
- gdq_delblks = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- gdq_unres = ip->i_gdquot;
- }
- }
-
- if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- ip->i_d.di_projid != pdqp->q_id) {
- pdq_delblks = pdqp;
- if (delblks) {
- ASSERT(ip->i_pdquot);
- pdq_unres = ip->i_pdquot;
- }
- }
-
- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1, flags | blkflags);
- if (error)
- return error;
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
/*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- ASSERT(udq_unres || gdq_unres || pdq_unres);
- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0, flags | blkflags);
- if (error)
- return error;
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_unres, gdq_unres, pdq_unres,
- -((xfs_qcnt_t)delblks), 0, blkflags);
- }
+ *IO_olddq = xfs_qm_dqhold(newdq);
- return 0;
+ return prevdq;
}
int
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_pwork.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+ blkdev_issue_flush(buftarg->bt_bdev);
}
STATIC void
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
0, mp->m_super->s_id);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
- mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
- if (!mp->m_eofblocks_workqueue)
+ mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s",
+ WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ 0, mp->m_super->s_id);
+ if (!mp->m_blockgc_workqueue)
goto out_destroy_reclaim;
- mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_super->s_id);
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_eofb;
return 0;
out_destroy_eofb:
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_cil:
struct xfs_mount *mp)
{
destroy_workqueue(mp->m_sync_workqueue);
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
}
/*
- * Trigger writeback of all the dirty metadata in the file system.
- *
- * This ensures that the metadata is written to their location on disk rather
- * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk - this is the
- * primary difference between a sync and a quiesce.
- *
- * We cancel log work early here to ensure all transactions the log worker may
- * run have finished before we clean up and log the superblock and write an
- * unmount record. The unfreeze process is responsible for restarting the log
- * worker correctly.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- cancel_delayed_work_sync(&mp->m_log->l_work);
-
- /* force the log to unpin objects from the now complete transactions */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_quiesce(mp);
-}
-
-/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
* to the log to dirty it in case of a crash while frozen. This ensures that we
* set a GFP_NOFS context here to avoid recursion deadlocks.
*/
flags = memalloc_nofs_save();
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
- ret = xfs_sync_sb(mp, true);
+ ret = xfs_log_quiesce(mp);
memalloc_nofs_restore(flags);
return ret;
}
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
return 0;
}
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
*/
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
/* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
+ error = xfs_blockgc_free_space(mp, NULL);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
*/
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
+ xfs_log_clean(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
return 0;
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
* max_active value for this workqueue.
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
if (!xfs_alloc_wq)
return -ENOMEM;
- xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+ xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ 0);
if (!xfs_discard_wq)
goto out_free_alloc_wq;
int
xfs_symlink(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
struct xfs_name *link_name,
const char *target_path,
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ pdqp, resblks, &tp);
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
goto out_trans_cancel;
}
- /*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
/*
* Allocate an inode for the symlink.
*/
- error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT),
+ 1, 0, prid, &ip);
if (error)
goto out_trans_cancel;
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_d.di_size);
/*
* Create the directory entry for the symlink.
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
#include "zonefs.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
static inline int zonefs_zone_mgmt(struct inode *inode,
enum req_opf op)
{
lockdep_assert_held(&zi->i_truncate_mutex);
+ trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
if (ret) {
iomap->bdev = inode->i_sb->s_bdev;
iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ trace_zonefs_iomap_begin(inode, iomap);
+
return 0;
}
}
inode->i_mode &= ~0222;
return i_size_read(inode);
+ case BLK_ZONE_COND_FULL:
+ /* The write pointer of full zones is invalid. */
+ return zi->i_max_size;
default:
if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
return zi->i_max_size;
return ret;
}
- static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
int ret;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
- ret = setattr_prepare(dentry, iattr);
+ ret = setattr_prepare(&init_user_ns, dentry, iattr);
if (ret)
return ret;
return ret;
}
- setattr_copy(inode, iattr);
+ setattr_copy(&init_user_ns, inode, iattr);
return 0;
}
if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
ret = file_write_and_wait_range(file, start, end);
if (!ret)
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (ret)
zonefs_io_error(inode, true);
if (!nr_pages)
return 0;
- bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+ bio = bio_alloc(GFP_NOFS, nr_pages);
if (!bio)
return -ENOMEM;
ret = submit_bio_wait(bio);
zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+ trace_zonefs_file_dio_append(inode, size, ret);
out_release:
bio_release_pages(bio, false);
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, sync);
+ &zonefs_write_dio_ops, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+ &zonefs_read_dio_ops, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
struct super_block *sb = parent->i_sb;
inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
- inode_init_owner(inode, parent, S_IFDIR | 0555);
+ inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
sb->s_time_gran = 1;
/*
- * The block size is set to the device physical sector size to ensure
- * that write operations on 512e devices (512B logical block and 4KB
- * physical block) are always aligned to the device physical blocks,
- * as mandated by the ZBC/ZAC specifications.
+ * The block size is set to the device zone write granularity to ensure
+ * that write operations are always aligned according to the device
+ * interface constraints.
*/
- sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
+ sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
sbi->s_uid = GLOBAL_ROOT_UID;
sbi->s_gid = GLOBAL_ROOT_GID;
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
+ #include <linux/mount.h>
+ #include <linux/cred.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
}
+ static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
+ kuid_t kuid)
+ {
+ return make_kuid(mnt_userns, __kuid_val(kuid));
+ }
+
+ static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
+ kgid_t kgid)
+ {
+ return make_kgid(mnt_userns, __kgid_val(kgid));
+ }
+
+ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+ {
+ return kuid_into_mnt(mnt_userns, inode->i_uid);
+ }
+
+ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+ {
+ return kgid_into_mnt(mnt_userns, inode->i_gid);
+ }
+
+ static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
+ kuid_t kuid)
+ {
+ return KUIDT_INIT(from_kuid(mnt_userns, kuid));
+ }
+
+ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
+ kgid_t kgid)
+ {
+ return KGIDT_INIT(from_kgid(mnt_userns, kgid));
+ }
+
+ static inline kuid_t fsuid_into_mnt(struct user_namespace *mnt_userns)
+ {
+ return kuid_from_mnt(mnt_userns, current_fsuid());
+ }
+
+ static inline kgid_t fsgid_into_mnt(struct user_namespace *mnt_userns)
+ {
+ return kgid_from_mnt(mnt_userns, current_fsgid());
+ }
+
extern struct timespec64 current_time(struct inode *inode);
/*
return __sb_start_write_trylock(sb, SB_FREEZE_FS);
}
-
- extern bool inode_owner_or_capable(const struct inode *inode);
+ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode);
/*
* VFS helper functions..
*/
- extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
- extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
- extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
- extern int vfs_symlink(struct inode *, struct dentry *, const char *);
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
- extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+ int vfs_create(struct user_namespace *, struct inode *,
+ struct dentry *, umode_t, bool);
+ int vfs_mkdir(struct user_namespace *, struct inode *,
+ struct dentry *, umode_t);
+ int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+ umode_t, dev_t);
+ int vfs_symlink(struct user_namespace *, struct inode *,
+ struct dentry *, const char *);
+ int vfs_link(struct dentry *, struct user_namespace *, struct inode *,
+ struct dentry *, struct inode **);
+ int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
+ int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
+ struct inode **);
+
+ struct renamedata {
+ struct user_namespace *old_mnt_userns;
+ struct inode *old_dir;
+ struct dentry *old_dentry;
+ struct user_namespace *new_mnt_userns;
+ struct inode *new_dir;
+ struct dentry *new_dentry;
+ struct inode **delegated_inode;
+ unsigned int flags;
+ } __randomize_layout;
- static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+ int vfs_rename(struct renamedata *);
+
+ static inline int vfs_whiteout(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry)
{
- return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+ return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+ WHITEOUT_DEV);
}
- extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
- int open_flag);
+ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+ struct dentry *dentry, umode_t mode, int open_flag);
int vfs_mkobj(struct dentry *, umode_t,
int (*f)(struct dentry *, umode_t, void *),
/*
* VFS file helper functions.
*/
- extern void inode_init_owner(struct inode *inode, const struct inode *dir,
- umode_t mode);
+ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct inode *dir, umode_t mode);
extern bool may_open_dev(const struct path *path);
/*
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
- int (*permission) (struct inode *, int);
+ int (*permission) (struct user_namespace *, struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
- int (*create) (struct inode *,struct dentry *, umode_t, bool);
+ int (*create) (struct user_namespace *, struct inode *,struct dentry *,
+ umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
- int (*symlink) (struct inode *,struct dentry *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,umode_t);
+ int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,
+ const char *);
+ int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,
+ umode_t);
int (*rmdir) (struct inode *,struct dentry *);
- int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
- int (*rename) (struct inode *, struct dentry *,
+ int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,
+ umode_t,dev_t);
+ int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
- int (*setattr) (struct dentry *, struct iattr *);
- int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+ int (*setattr) (struct user_namespace *, struct dentry *,
+ struct iattr *);
+ int (*getattr) (struct user_namespace *, const struct path *,
+ struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode);
- int (*tmpfile) (struct inode *, struct dentry *, umode_t);
- int (*set_acl)(struct inode *, struct posix_acl *, int);
+ int (*tmpfile) (struct user_namespace *, struct inode *,
+ struct dentry *, umode_t);
+ int (*set_acl)(struct user_namespace *, struct inode *,
+ struct posix_acl *, int);
} ____cacheline_aligned;
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
- static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
+ struct inode *inode)
{
- return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+ return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+ !gid_valid(i_gid_into_mnt(mnt_userns, inode));
}
static inline enum rw_hint file_write_hint(struct file *file)
/*
* Inode state bits. Protected by inode->i_lock
*
- * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
- * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
+ * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
+ * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
*
* Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
* until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
* Two bits are used for locking and completion notification, I_NEW and I_SYNC.
*
* I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
- * fdatasync(). i_atime is the usual cause.
- * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
+ * fdatasync() (unless I_DIRTY_DATASYNC is also set).
+ * Timestamp updates are the usual cause.
+ * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
* these changes separately from I_DIRTY_SYNC so that we
* don't have to write inode on fdatasync() when only
- * mtime has changed in it.
+ * e.g. the timestamps have changed.
* I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
+ * I_DIRTY_TIME The inode itself only has dirty timestamps, and the
+ * lazytime mount option is enabled. We keep track of this
+ * separately from I_DIRTY_SYNC in order to implement
+ * lazytime. This gets cleared if I_DIRTY_INODE
+ * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. I.e.
+ * either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in
+ * i_state, but not both. I_DIRTY_PAGES may still be set.
* I_NEW Serves as both a mutex and completion notification.
* New inodes set I_NEW. If two processes both create
* the same inode, one of them will release its inode and
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+/*
+ * Returns true if the given inode itself only has dirty timestamps (its pages
+ * may still be dirty) and isn't currently being allocated or freed.
+ * Filesystems should call this if when writing an inode when lazytime is
+ * enabled, they want to opportunistically write the timestamps of other inodes
+ * located very nearby on-disk, e.g. in the same inode block. This returns true
+ * if the given inode is in need of such an opportunistic update. Requires
+ * i_lock, or at least later re-checking under i_lock.
+ */
+static inline bool inode_is_dirtytime_only(struct inode *inode)
+{
+ return (inode->i_state & (I_DIRTY_TIME | I_NEW |
+ I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+}
+
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
+ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
};
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
+ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
+ {
+ return mnt_user_ns(file->f_path.mnt);
+ }
extern long vfs_truncate(const struct path *, loff_t);
- extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
- struct file *filp);
+ int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
+ unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
}
#endif
- extern int notify_change(struct dentry *, struct iattr *, struct inode **);
- extern int inode_permission(struct inode *, int);
- extern int generic_permission(struct inode *, int);
- extern int __check_sticky(struct inode *dir, struct inode *inode);
+ int notify_change(struct user_namespace *, struct dentry *,
+ struct iattr *, struct inode **);
+ int inode_permission(struct user_namespace *, struct inode *, int);
+ int generic_permission(struct user_namespace *, struct inode *, int);
+ static inline int file_permission(struct file *file, int mask)
+ {
+ return inode_permission(file_mnt_user_ns(file),
+ file_inode(file), mask);
+ }
+ static inline int path_permission(const struct path *path, int mask)
+ {
+ return inode_permission(mnt_user_ns(path->mnt),
+ d_inode(path->dentry), mask);
+ }
+ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode *inode);
static inline bool execute_ok(struct inode *inode)
{
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
- extern void generic_fillattr(struct inode *, struct kstat *);
+ void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
- extern int simple_setattr(struct dentry *, struct iattr *);
- extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
+ extern int simple_setattr(struct user_namespace *, struct dentry *,
+ struct iattr *);
+ extern int simple_getattr(struct user_namespace *, const struct path *,
+ struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
- extern int simple_rename(struct inode *, struct dentry *,
- struct inode *, struct dentry *, unsigned int);
+ extern int simple_rename(struct user_namespace *, struct inode *,
+ struct dentry *, struct inode *, struct dentry *,
+ unsigned int);
extern void simple_recursive_removal(struct dentry *,
void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern int generic_check_addressable(unsigned, u64);
-#ifdef CONFIG_UNICODE
-extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
-extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name);
-#endif
extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
#ifdef CONFIG_MIGRATION
#define buffer_migrate_page_norefs NULL
#endif
- extern int setattr_prepare(struct dentry *, struct iattr *);
+ int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
- extern void setattr_copy(struct inode *inode, const struct iattr *attr);
+ void setattr_copy(struct user_namespace *, struct inode *inode,
+ const struct iattr *attr);
extern int file_update_time(struct file *file);
return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
}
- static inline int check_sticky(struct inode *dir, struct inode *inode)
+ static inline int check_sticky(struct user_namespace *mnt_userns,
+ struct inode *dir, struct inode *inode)
{
if (!(dir->i_mode & S_ISVTX))
return 0;
- return __check_sticky(dir, inode);
+ return __check_sticky(mnt_userns, dir, inode);
}
static inline void inode_has_no_xattr(struct inode *inode)
#ifdef CONFIG_IMA
extern int ima_bprm_check(struct linux_binprm *bprm);
extern int ima_file_check(struct file *file, int mask);
- extern void ima_post_create_tmpfile(struct inode *inode);
+ extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *inode);
extern void ima_file_free(struct file *file);
extern int ima_file_mmap(struct file *file, unsigned long prot);
extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
bool contents);
extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
enum kernel_read_file_id id);
- extern void ima_post_path_mknod(struct dentry *dentry);
+ extern void ima_post_path_mknod(struct user_namespace *mnt_userns,
+ struct dentry *dentry);
extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
+extern void ima_measure_critical_data(const char *event_label,
+ const char *event_name,
+ const void *buf, size_t buf_len,
+ bool hash);
#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
extern void ima_appraise_parse_cmdline(void);
return 0;
}
- static inline void ima_post_create_tmpfile(struct inode *inode)
+ static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *inode)
{
}
return 0;
}
- static inline void ima_post_path_mknod(struct dentry *dentry)
+ static inline void ima_post_path_mknod(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
return;
}
}
static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
+
+static inline void ima_measure_critical_data(const char *event_label,
+ const char *event_name,
+ const void *buf, size_t buf_len,
+ bool hash) {}
+
#endif /* CONFIG_IMA */
#ifndef CONFIG_IMA_KEXEC
#ifdef CONFIG_IMA_APPRAISE
extern bool is_ima_appraise_enabled(void);
- extern void ima_inode_post_setattr(struct dentry *dentry);
+ extern void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry);
extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
const void *xattr_value, size_t xattr_value_len);
extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name);
return 0;
}
- static inline void ima_inode_post_setattr(struct dentry *dentry)
+ static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
return;
}
LSM_HOOK(int, 0, inode_init_security, struct inode *inode,
struct inode *dir, const struct qstr *qstr, const char **name,
void **value, size_t *len)
+LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode,
+ const struct qstr *name, const struct inode *context_inode)
LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry,
umode_t mode)
LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir,
LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr)
LSM_HOOK(int, 0, inode_getattr, const struct path *path)
- LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
- LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name)
+ LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
- LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry)
- LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode,
- const char *name, void **buffer, bool alloc)
+ LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns,
+ struct dentry *dentry)
+ LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name, void **buffer, bool alloc)
LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
* Returns 0 if @name and @value have been successfully set,
* -EOPNOTSUPP if no security attribute is needed, or
* -ENOMEM on memory allocation failure.
+ * @inode_init_security_anon:
+ * Set up the incore security field for the new anonymous inode
+ * and return whether the inode creation is permitted by the security
+ * module or not.
+ * @inode contains the inode structure
+ * @name name of the anonymous inode class
+ * @context_inode optional related inode
+ * Returns 0 on success, -EACCES if the security module denies the
+ * creation of this inode, or another -errno upon other errors.
* @inode_create:
* Check permission to create a regular file.
* @dir contains inode structure of the parent of the new file.
* @inode_killpriv:
* The setuid bit is being removed. Remove similar security labels.
* Called with the dentry->d_inode->i_mutex held.
+ * @mnt_userns: user namespace of the mount
* @dentry is the dentry being changed.
* Return 0 on success. If error is returned, then the operation
* causing setuid bit removal is failed.
const kernel_cap_t *inheritable,
const kernel_cap_t *permitted);
extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
- extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
- extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
- extern int cap_inode_need_killpriv(struct dentry *dentry);
- extern int cap_inode_killpriv(struct dentry *dentry);
- extern int cap_inode_getsecurity(struct inode *inode, const char *name,
- void **buffer, bool alloc);
+ int cap_inode_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ int cap_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name);
+ int cap_inode_need_killpriv(struct dentry *dentry);
+ int cap_inode_killpriv(struct user_namespace *mnt_userns,
+ struct dentry *dentry);
+ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name, void **buffer,
+ bool alloc);
extern int cap_mmap_addr(unsigned long addr);
extern int cap_mmap_file(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags);
int security_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr,
initxattrs initxattrs, void *fs_data);
+int security_inode_init_security_anon(struct inode *inode,
+ const struct qstr *name,
+ const struct inode *context_inode);
int security_old_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr, const char **name,
void **value, size_t *len);
int security_inode_permission(struct inode *inode, int mask);
int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
int security_inode_getattr(const struct path *path);
- int security_inode_setxattr(struct dentry *dentry, const char *name,
+ int security_inode_setxattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
int security_inode_getxattr(struct dentry *dentry, const char *name);
int security_inode_listxattr(struct dentry *dentry);
- int security_inode_removexattr(struct dentry *dentry, const char *name);
+ int security_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name);
int security_inode_need_killpriv(struct dentry *dentry);
- int security_inode_killpriv(struct dentry *dentry);
- int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc);
+ int security_inode_killpriv(struct user_namespace *mnt_userns,
+ struct dentry *dentry);
+ int security_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name,
+ void **buffer, bool alloc);
int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
void security_inode_getsecid(struct inode *inode, u32 *secid);
return 0;
}
+static inline int security_inode_init_security_anon(struct inode *inode,
+ const struct qstr *name,
+ const struct inode *context_inode)
+{
+ return 0;
+}
+
static inline int security_old_inode_init_security(struct inode *inode,
struct inode *dir,
const struct qstr *qstr,
return 0;
}
- static inline int security_inode_setxattr(struct dentry *dentry,
- const char *name, const void *value, size_t size, int flags)
+ static inline int security_inode_setxattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
return cap_inode_setxattr(dentry, name, value, size, flags);
}
return 0;
}
- static inline int security_inode_removexattr(struct dentry *dentry,
- const char *name)
+ static inline int security_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ const char *name)
{
- return cap_inode_removexattr(dentry, name);
+ return cap_inode_removexattr(mnt_userns, dentry, name);
}
static inline int security_inode_need_killpriv(struct dentry *dentry)
return cap_inode_need_killpriv(dentry);
}
- static inline int security_inode_killpriv(struct dentry *dentry)
+ static inline int security_inode_killpriv(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
- return cap_inode_killpriv(dentry);
+ return cap_inode_killpriv(mnt_userns, dentry);
}
- static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ static inline int security_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode,
+ const char *name, void **buffer,
+ bool alloc)
{
- return cap_inode_getsecurity(inode, name, buffer, alloc);
+ return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
}
static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
struct io_uring_params;
struct clone_args;
struct open_how;
+ struct mount_attr;
#include <linux/types.h>
#include <linux/aio_abi.h>
/* kernel/futex.c */
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
- struct __kernel_timespec __user *utime, u32 __user *uaddr2,
- u32 val3);
+ const struct __kernel_timespec __user *utime,
+ u32 __user *uaddr2, u32 val3);
asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val,
- struct old_timespec32 __user *utime, u32 __user *uaddr2,
- u32 val3);
+ const struct old_timespec32 __user *utime,
+ u32 __user *uaddr2, u32 val3);
asmlinkage long sys_get_robust_list(int pid,
struct robust_list_head __user * __user *head_ptr,
size_t __user *len_ptr);
asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
int to_dfd, const char __user *to_path,
unsigned int ms_flags);
+ asmlinkage long sys_mount_setattr(int dfd, const char __user *path,
+ unsigned int flags,
+ struct mount_attr __user *uattr, size_t usize);
asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
const void __user *value, int aux);
return rule->mask[word] & bit;
}
-/* At syscall entry and exit time, this filter is called if the
- * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+/* At syscall exit time, this filter is called if the audit_state is
+ * not low enough that auditing cannot take place, but is also not
+ * high enough that we already know we have to write an audit record
+ * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx,
struct list_head *list)
{
enum audit_state state;
if (auditd_test_task(tsk))
- return AUDIT_DISABLED;
+ return;
rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
&state, false)) {
rcu_read_unlock();
ctx->current_state = state;
- return state;
+ return;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return;
}
/*
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(dentry, &caps);
+ rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
if (rc)
return rc;
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&init_user_ns,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
{
struct psi_trigger *new;
struct cgroup *cgrp;
+ struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
if (!inode)
return -ENOMEM;
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
iput(inode);
return ret;
}
return ret;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
{
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
+ /* process and thread migrations follow same delegation rule */
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, true);
+ of->file->f_path.dentry->d_sb, threadgroup);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, false);
- if (ret)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
-#include <linux/dcookies.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
- err = inode_permission(inode, MAY_EXEC);
+ err = file_permission(exe.file, MAY_EXEC);
if (err)
goto exit;
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
* otherwise we'd be including shared non-exclusive mappings, which
* opens a side channel.
*/
- return inode_owner_or_capable(file_inode(vma->vm_file)) ||
- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
}
static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
return 0;
}
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ refill_stock(memcg, nr_pages);
}
/**
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ ret = file_permission(cfile.file, MAY_READ);
if (ret < 0)
goto out_put_cfile;
if (err)
return err;
+ page_counter_set_high(&memcg->memory, high);
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
break;
}
- page_counter_set_high(&memcg->memory, high);
-
memcg_wb_domain_size_changed(memcg);
-
return nbytes;
}
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
- static int shmem_getattr(const struct path *path, struct kstat *stat,
+ static int shmem_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = path->dentry->d_inode;
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
if (is_huge_enabled(sb_info))
stat->blksize = HPAGE_PMD_SIZE;
return 0;
}
- static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+ static int shmem_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
}
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
return error;
}
{
struct vm_area_struct pvma;
struct page *page;
- struct vm_fault vmf;
+ struct vm_fault vmf = {
+ .vma = &pvma,
+ };
shmem_pseudo_vma_init(&pvma, info, index);
- vmf.vma = &pvma;
- vmf.address = 0;
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
inode = new_inode(sb);
if (inode) {
inode->i_ino = ino;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_generation = prandom_u32();
* File creation. Allocate an inode, and we're done..
*/
static int
- shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
}
static int
- shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
return error;
}
- static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+ if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+ mode | S_IFDIR, 0)))
return error;
inc_nlink(dir);
return 0;
}
- static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+ static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
}
/*
return 0;
}
- static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+ static int shmem_whiteout(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
int error;
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(old_dir, whiteout,
+ error = shmem_mknod(&init_user_ns, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
* it exists so that the VFS layer correctly free's it when it
* gets overwritten.
*/
- static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+ static int shmem_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(old_dir, old_dentry);
+ error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
if (error)
return error;
}
return 0;
}
- static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
int error;
int len;
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
};
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *inode,
const char *suffix, const void *value,
size_t size, int flags)
return used;
}
- static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+ static int sockfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
{
- int err = simple_setattr(dentry, iattr);
+ int err = simple_setattr(&init_user_ns, dentry, iattr);
if (!err && (iattr->ia_valid & ATTR_UID)) {
struct socket *sock = SOCKET_I(d_inode(dentry));
return __sys_setsockopt(fd, level, optname, optval, optlen);
}
+INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
+ int optname));
+
/*
* Get a socket option. Because we don't know the option lengths we have
* to pass a user mode parameter for the protocols to sort out.
/**
* cap_inode_killpriv - Erase the security markings on an inode
- * @dentry: The inode/dentry to alter
+ *
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: The inode/dentry to alter
*
* Erase the privilege-enhancing security markings on an inode.
*
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
* Returns 0 if successful, -ve on error.
*/
- int cap_inode_killpriv(struct dentry *dentry)
+ int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
{
int error;
- error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
+ error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
if (error == -EOPNOTSUPP)
error = 0;
return error;
* by the integrity subsystem, which really wants the unconverted values -
* so that's good.
*/
- int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name, void **buffer,
bool alloc)
{
int size, ret;
kuid_t kroot;
+ u32 nsmagic, magic;
uid_t root, mappedroot;
char *tmpbuf = NULL;
struct vfs_cap_data *cap;
- struct vfs_ns_cap_data *nscap;
+ struct vfs_ns_cap_data *nscap = NULL;
struct dentry *dentry;
struct user_namespace *fs_ns;
return -EINVAL;
size = sizeof(struct vfs_ns_cap_data);
- ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
- &tmpbuf, size, GFP_NOFS);
+ ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
+ &tmpbuf, size, GFP_NOFS);
dput(dentry);
if (ret < 0)
fs_ns = inode->i_sb->s_user_ns;
cap = (struct vfs_cap_data *) tmpbuf;
if (is_v2header((size_t) ret, cap)) {
- /* If this is sizeof(vfs_cap_data) then we're ok with the
- * on-disk value, so return that. */
- if (alloc)
- *buffer = tmpbuf;
- else
- kfree(tmpbuf);
- return ret;
- } else if (!is_v3header((size_t) ret, cap)) {
- kfree(tmpbuf);
- return -EINVAL;
+ root = 0;
+ } else if (is_v3header((size_t) ret, cap)) {
+ nscap = (struct vfs_ns_cap_data *) tmpbuf;
+ root = le32_to_cpu(nscap->rootid);
+ } else {
+ size = -EINVAL;
+ goto out_free;
}
- nscap = (struct vfs_ns_cap_data *) tmpbuf;
- root = le32_to_cpu(nscap->rootid);
kroot = make_kuid(fs_ns, root);
+ /* If this is an idmapped mount shift the kuid. */
+ kroot = kuid_into_mnt(mnt_userns, kroot);
+
/* If the root kuid maps to a valid uid in current ns, then return
* this as a nscap. */
mappedroot = from_kuid(current_user_ns(), kroot);
if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+ size = sizeof(struct vfs_ns_cap_data);
if (alloc) {
- *buffer = tmpbuf;
+ if (!nscap) {
+ /* v2 -> v3 conversion */
+ nscap = kzalloc(size, GFP_ATOMIC);
+ if (!nscap) {
+ size = -ENOMEM;
+ goto out_free;
+ }
+ nsmagic = VFS_CAP_REVISION_3;
+ magic = le32_to_cpu(cap->magic_etc);
+ if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+ memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ nscap->magic_etc = cpu_to_le32(nsmagic);
+ } else {
+ /* use allocated v3 buffer */
+ tmpbuf = NULL;
+ }
nscap->rootid = cpu_to_le32(mappedroot);
- } else
- kfree(tmpbuf);
- return size;
+ *buffer = nscap;
+ }
+ goto out_free;
}
if (!rootid_owns_currentns(kroot)) {
- kfree(tmpbuf);
- return -EOPNOTSUPP;
+ size = -EOVERFLOW;
+ goto out_free;
}
/* This comes from a parent namespace. Return as a v2 capability */
size = sizeof(struct vfs_cap_data);
if (alloc) {
- *buffer = kmalloc(size, GFP_ATOMIC);
- if (*buffer) {
- struct vfs_cap_data *cap = *buffer;
- __le32 nsmagic, magic;
+ if (nscap) {
+ /* v3 -> v2 conversion */
+ cap = kzalloc(size, GFP_ATOMIC);
+ if (!cap) {
+ size = -ENOMEM;
+ goto out_free;
+ }
magic = VFS_CAP_REVISION_2;
nsmagic = le32_to_cpu(nscap->magic_etc);
if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
cap->magic_etc = cpu_to_le32(magic);
} else {
- size = -ENOMEM;
+ /* use unconverted v2 */
+ tmpbuf = NULL;
}
+ *buffer = cap;
}
+out_free:
kfree(tmpbuf);
return size;
}
+ /**
+ * rootid_from_xattr - translate root uid of vfs caps
+ *
+ * @value: vfs caps value which may be modified by this function
+ * @size: size of @ivalue
+ * @task_ns: user namespace of the caller
+ * @mnt_userns: user namespace of the mount the inode was found from
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
static kuid_t rootid_from_xattr(const void *value, size_t size,
- struct user_namespace *task_ns)
+ struct user_namespace *task_ns,
+ struct user_namespace *mnt_userns)
{
const struct vfs_ns_cap_data *nscap = value;
+ kuid_t rootkid;
uid_t rootid = 0;
if (size == XATTR_CAPS_SZ_3)
rootid = le32_to_cpu(nscap->rootid);
- return make_kuid(task_ns, rootid);
+ rootkid = make_kuid(task_ns, rootid);
+ return kuid_from_mnt(mnt_userns, rootkid);
}
static bool validheader(size_t size, const struct vfs_cap_data *cap)
return is_v2header(size, cap) || is_v3header(size, cap);
}
- /*
+ /**
+ * cap_convert_nscap - check vfs caps
+ *
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: used to retrieve inode to check permissions on
+ * @ivalue: vfs caps value which may be modified by this function
+ * @size: size of @ivalue
+ *
* User requested a write of security.capability. If needed, update the
* xattr to change from v2 to v3, or to fixup the v3 rootid.
*
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
* If all is ok, we return the new size, on error return < 0.
*/
- int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
+ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const void **ivalue, size_t size)
{
struct vfs_ns_cap_data *nscap;
uid_t nsrootid;
__u32 magic, nsmagic;
struct inode *inode = d_backing_inode(dentry);
struct user_namespace *task_ns = current_user_ns(),
- *fs_ns = inode->i_sb->s_user_ns;
+ *fs_ns = inode->i_sb->s_user_ns,
+ *ancestor;
kuid_t rootid;
size_t newsize;
return -EINVAL;
if (!validheader(size, cap))
return -EINVAL;
- if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
return -EPERM;
- if (size == XATTR_CAPS_SZ_2)
+ if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns))
if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
/* user is privileged, just write the v2 */
return size;
- rootid = rootid_from_xattr(*ivalue, size, task_ns);
+ rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns);
if (!uid_valid(rootid))
return -EINVAL;
if (nsrootid == -1)
return -EINVAL;
+ /*
+ * Do not allow allow adding a v3 filesystem capability xattr
+ * if the rootid field is ambiguous.
+ */
+ for (ancestor = task_ns->parent; ancestor; ancestor = ancestor->parent) {
+ if (from_kuid(ancestor, rootid) == 0)
+ return -EINVAL;
+ }
+
newsize = sizeof(struct vfs_ns_cap_data);
nscap = kmalloc(newsize, GFP_ATOMIC);
if (!nscap)
return *effective ? ret : 0;
}
- /*
+ /**
+ * get_vfs_caps_from_disk - retrieve vfs caps from disk
+ *
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: dentry from which @inode is retrieved
+ * @cpu_caps: vfs capabilities
+ *
* Extract the on-exec-apply capability sets for an executable file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
*/
- int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+ const struct dentry *dentry,
+ struct cpu_vfs_cap_data *cpu_caps)
{
struct inode *inode = d_backing_inode(dentry);
__u32 magic_etc;
/* Limit the caps to the mounter of the filesystem
* or the more limited uid specified in the xattr.
*/
+ rootkuid = kuid_into_mnt(mnt_userns, rootkuid);
if (!rootid_owns_currentns(rootkuid))
return -ENODATA;
if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
return 0;
- rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
+ rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
+ file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
/**
* cap_inode_removexattr - Determine whether an xattr may be removed
- * @dentry: The inode/dentry being altered
- * @name: The name of the xattr to be changed
+ *
+ * @mnt_userns: User namespace of the mount the inode was found from
+ * @dentry: The inode/dentry being altered
+ * @name: The name of the xattr to be changed
*
* Determine whether an xattr may be removed from an inode, returning 0 if
* permission is granted, -ve if denied.
*
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
* This is used to make sure security xattrs don't get removed by those who
* aren't privileged to remove them.
*/
- int cap_inode_removexattr(struct dentry *dentry, const char *name)
+ int cap_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name)
{
struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
struct inode *inode = d_backing_inode(dentry);
if (!inode)
return -EINVAL;
- if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
return -EPERM;
return 0;
}
{
long rc;
const char *algo;
- struct crypto_shash **tfm, *tmp_tfm;
+ struct crypto_shash **tfm, *tmp_tfm = NULL;
struct shash_desc *desc;
if (type == EVM_XATTR_HMAC) {
alloc:
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(*tfm),
GFP_KERNEL);
- if (!desc)
+ if (!desc) {
+ crypto_free_shash(tmp_tfm);
return ERR_PTR(-ENOMEM);
+ }
desc->tfm = *tfm;
rc = crypto_shash_init(desc);
if (rc) {
+ crypto_free_shash(tmp_tfm);
kfree(desc);
return ERR_PTR(rc);
}
ima_present = true;
continue;
}
- size = vfs_getxattr_alloc(dentry, xattr->name,
+ size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name,
&xattr_value, xattr_size, GFP_NOFS);
if (size == -ENOMEM) {
error = -ENOMEM;
return 1;
/* Do this the hard way */
- rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
- GFP_NOFS);
+ rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+ (char **)&xattr_data, 0, GFP_NOFS);
if (rc <= 0) {
if (rc == -ENODATA)
return 0;
xattr_value_len, &data);
if (rc == 0) {
data.hdr.xattr.sha1.type = EVM_XATTR_HMAC;
- rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_EVM,
+ rc = __vfs_setxattr_noperm(&init_user_ns, dentry,
+ XATTR_NAME_EVM,
&data.hdr.xattr.data[1],
SHA1_DIGEST_SIZE + 1, 0);
} else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) {
- rc = __vfs_removexattr(dentry, XATTR_NAME_EVM);
+ rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM);
}
return rc;
}
hook(POLICY_CHECK, policy) \
hook(KEXEC_CMDLINE, kexec_cmdline) \
hook(KEY_CHECK, key) \
+ hook(CRITICAL_DATA, critical_data) \
hook(MAX_CHECK, none)
#define __ima_hook_enumify(ENUM, str) ENUM,
#endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */
/* LIM API function definitions */
- int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
- int mask, enum ima_hooks func, int *pcr,
+ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct cred *cred, u32 secid, int mask,
+ enum ima_hooks func, int *pcr,
struct ima_template_desc **template_desc,
- const char *keyring);
+ const char *func_data);
int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
int ima_collect_measurement(struct integrity_iint_cache *iint,
struct file *file, void *buf, loff_t size,
struct evm_ima_xattr_data *xattr_value,
int xattr_len, const struct modsig *modsig, int pcr,
struct ima_template_desc *template_desc);
- void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+ void process_buffer_measurement(struct user_namespace *mnt_userns,
+ struct inode *inode, const void *buf, int size,
const char *eventname, enum ima_hooks func,
- int pcr, const char *keyring);
+ int pcr, const char *func_data,
+ bool buf_hash);
void ima_audit_measurement(struct integrity_iint_cache *iint,
const unsigned char *filename);
int ima_alloc_init_template(struct ima_event_data *event_data,
const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
/* IMA policy related functions */
- int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
- enum ima_hooks func, int mask, int flags, int *pcr,
+ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct cred *cred, u32 secid, enum ima_hooks func,
+ int mask, int flags, int *pcr,
struct ima_template_desc **template_desc,
- const char *keyring);
+ const char *func_data);
void ima_init_policy(void);
void ima_update_policy(void);
void ima_update_policy_flag(void);
struct file *file, const unsigned char *filename,
struct evm_ima_xattr_data *xattr_value,
int xattr_len, const struct modsig *modsig);
- int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
+ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask, enum ima_hooks func);
void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
enum ima_hooks func);
return INTEGRITY_UNKNOWN;
}
- static inline int ima_must_appraise(struct inode *inode, int mask,
+ static inline int ima_must_appraise(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask,
enum ima_hooks func)
{
return 0;
/**
* ima_get_action - appraise & measure decision based on policy.
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: pointer to the inode associated with the object being validated
* @cred: pointer to credentials structure to validate
* @secid: secid of the task being validated
* @func: caller identifier
* @pcr: pointer filled in if matched measure policy sets pcr=
* @template_desc: pointer filled in if matched measure policy sets template=
- * @keyring: keyring name used to determine the action
+ * @func_data: func specific data, may be NULL
*
* The policy is defined in terms of keypairs:
* subj=, obj=, type=, func=, mask=, fsmagic=
* subj,obj, and type: are LSM specific.
* func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
- * | KEXEC_CMDLINE | KEY_CHECK
+ * | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA
* mask: contains the permission mask
* fsmagic: hex value
*
* Returns IMA_MEASURE, IMA_APPRAISE mask.
*
*/
- int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
- int mask, enum ima_hooks func, int *pcr,
+ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct cred *cred, u32 secid, int mask,
+ enum ima_hooks func, int *pcr,
struct ima_template_desc **template_desc,
- const char *keyring)
+ const char *func_data)
{
int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH;
flags &= ima_policy_flag;
- return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
- template_desc, func_data);
+ return ima_match_policy(mnt_userns, inode, cred, secid, func, mask,
- flags, pcr, template_desc, keyring);
++ flags, pcr, template_desc, func_data);
}
/*
*
* Return 1 to appraise or hash
*/
- int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
+ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask, enum ima_hooks func)
{
u32 secid;
return 0;
security_task_getsecid(current, &secid);
- return ima_match_policy(inode, current_cred(), secid, func, mask,
- IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
+ return ima_match_policy(mnt_userns, inode, current_cred(), secid, func,
+ mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
}
static int ima_fix_xattr(struct dentry *dentry,
iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
iint->ima_hash->xattr.ng.algo = algo;
}
- rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
+ rc = __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA,
&iint->ima_hash->xattr.data[offset],
(sizeof(iint->ima_hash->xattr) - offset) +
iint->ima_hash->length, 0);
{
ssize_t ret;
- ret = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value,
- 0, GFP_NOFS);
+ ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA,
+ (char **)xattr_value, 0, GFP_NOFS);
if (ret == -EOPNOTSUPP)
ret = 0;
return ret;
rc = is_binary_blacklisted(digest, digestsize);
if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
- process_buffer_measurement(NULL, digest, digestsize,
+ process_buffer_measurement(&init_user_ns, NULL, digest, digestsize,
"blacklisted-hash", NONE,
- pcr, NULL);
+ pcr, NULL, false);
}
return rc;
/**
* ima_inode_post_setattr - reflect file metadata changes
+ * @mnt_userns: user namespace of the mount the inode was found from
* @dentry: pointer to the affected dentry
*
* Changes to a dentry's metadata might result in needing to appraise.
* This function is called from notify_change(), which expects the caller
* to lock the inode's i_mutex.
*/
- void ima_inode_post_setattr(struct dentry *dentry)
+ void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
struct integrity_iint_cache *iint;
|| !(inode->i_opflags & IOP_XATTR))
return;
- action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
+ action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR);
if (!action)
- __vfs_removexattr(dentry, XATTR_NAME_IMA);
+ __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
iint = integrity_iint_find(inode);
if (iint) {
set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
*/
#include <keys/asymmetric-type.h>
+ #include <linux/user_namespace.h>
#include "ima.h"
/**
* if the IMA policy is configured to measure a key linked
* to the given keyring.
*/
- process_buffer_measurement(NULL, payload, payload_len,
+ process_buffer_measurement(&init_user_ns, NULL, payload, payload_len,
keyring->description, KEY_CHECK, 0,
- keyring->description);
+ keyring->description, false);
}
* bitmask based on the appraise/audit/measurement policy.
* Included is the appraise submask.
*/
- action = ima_get_action(inode, cred, secid, mask, func, &pcr,
- &template_desc, NULL);
+ action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid,
+ mask, func, &pcr, &template_desc, NULL);
violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) &&
(ima_policy_flag & IMA_MEASURE));
if (!action && !violation_check)
security_task_getsecid(current, &secid);
inode = file_inode(vma->vm_file);
- action = ima_get_action(inode, current_cred(), secid, MAY_EXEC,
- MMAP_CHECK, &pcr, &template, 0);
+ action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode,
+ current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+ &pcr, &template, 0);
/* Is the mmap'ed file in policy? */
if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
/**
* ima_post_create_tmpfile - mark newly created tmpfile as new
+ * @mnt_userns: user namespace of the mount the inode was found from
* @file : newly created tmpfile
*
* No measuring, appraising or auditing of newly created tmpfiles is needed.
* Skip calling process_measurement(), but indicate which newly, created
* tmpfiles are in policy.
*/
- void ima_post_create_tmpfile(struct inode *inode)
+ void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *inode)
{
struct integrity_iint_cache *iint;
int must_appraise;
- must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+ must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+ FILE_CHECK);
if (!must_appraise)
return;
/**
* ima_post_path_mknod - mark as a new inode
+ * @mnt_userns: user namespace of the mount the inode was found from
* @dentry: newly created dentry
*
* Mark files created via the mknodat syscall as new, so that the
* file data can be written later.
*/
- void ima_post_path_mknod(struct dentry *dentry)
+ void ima_post_path_mknod(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct integrity_iint_cache *iint;
struct inode *inode = dentry->d_inode;
int must_appraise;
- must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+ must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+ FILE_CHECK);
if (!must_appraise)
return;
}
/*
- * process_buffer_measurement - Measure the buffer to ima log.
+ * process_buffer_measurement - Measure the buffer or the buffer data hash
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode associated with the object being measured (NULL for KEY_CHECK)
* @buf: pointer to the buffer that needs to be added to the log.
* @size: size of buffer(in bytes).
* @eventname: event name to be used for the buffer entry.
* @func: IMA hook
* @pcr: pcr to extend the measurement
- * @keyring: keyring name to determine the action to be performed
+ * @func_data: func specific data, may be NULL
+ * @buf_hash: measure buffer data hash
*
- * Based on policy, the buffer is measured into the ima log.
+ * Based on policy, either the buffer data or buffer data hash is measured
*/
- void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+ void process_buffer_measurement(struct user_namespace *mnt_userns,
+ struct inode *inode, const void *buf, int size,
const char *eventname, enum ima_hooks func,
- int pcr, const char *keyring)
+ int pcr, const char *func_data,
+ bool buf_hash)
{
int ret = 0;
const char *audit_cause = "ENOMEM";
struct ima_digest_data hdr;
char digest[IMA_MAX_DIGEST_SIZE];
} hash = {};
+ char digest_hash[IMA_MAX_DIGEST_SIZE];
+ int digest_hash_len = hash_digest_size[ima_hash_algo];
int violation = 0;
int action = 0;
u32 secid;
*/
if (func) {
security_task_getsecid(current, &secid);
- action = ima_get_action(inode, current_cred(), secid, 0, func,
- &pcr, &template, func_data);
+ action = ima_get_action(mnt_userns, inode, current_cred(),
+ secid, 0, func, &pcr, &template,
- keyring);
++ func_data);
if (!(action & IMA_MEASURE))
return;
}
goto out;
}
+ if (buf_hash) {
+ memcpy(digest_hash, hash.hdr.digest, digest_hash_len);
+
+ ret = ima_calc_buffer_hash(digest_hash, digest_hash_len,
+ iint.ima_hash);
+ if (ret < 0) {
+ audit_cause = "hashing_error";
+ goto out;
+ }
+
+ event_data.buf = digest_hash;
+ event_data.buf_len = digest_hash_len;
+ }
+
ret = ima_alloc_init_template(&event_data, &entry, template);
if (ret < 0) {
audit_cause = "alloc_entry";
goto out;
}
- ret = ima_store_template(entry, violation, NULL, buf, pcr);
+ ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr);
if (ret < 0) {
audit_cause = "store_entry";
ima_free_template_entry(entry);
if (!f.file)
return;
- process_buffer_measurement(file_inode(f.file), buf, size,
- "kexec-cmdline", KEXEC_CMDLINE, 0, NULL,
- false);
+ process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file),
+ buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
- NULL);
++ NULL, false);
fdput(f);
}
- process_buffer_measurement(NULL, buf, buf_len, event_name,
+/**
+ * ima_measure_critical_data - measure kernel integrity critical data
+ * @event_label: unique event label for grouping and limiting critical data
+ * @event_name: event name for the record in the IMA measurement list
+ * @buf: pointer to buffer data
+ * @buf_len: length of buffer data (in bytes)
+ * @hash: measure buffer data hash
+ *
+ * Measure data critical to the integrity of the kernel into the IMA log
+ * and extend the pcr. Examples of critical data could be various data
+ * structures, policies, and states stored in kernel memory that can
+ * impact the integrity of the system.
+ */
+void ima_measure_critical_data(const char *event_label,
+ const char *event_name,
+ const void *buf, size_t buf_len,
+ bool hash)
+{
+ if (!event_name || !event_label || !buf || !buf_len)
+ return;
+
++ process_buffer_measurement(&init_user_ns, NULL, buf, buf_len, event_name,
+ CRITICAL_DATA, 0, event_label,
+ hash);
+}
+
static int __init init_ima(void)
{
int error;
#define IMA_PCR 0x0100
#define IMA_FSNAME 0x0200
#define IMA_KEYRINGS 0x0400
+#define IMA_LABEL 0x0800
#define UNKNOWN 0
#define MEASURE 0x0001 /* same as IMA_MEASURE */
} lsm[MAX_LSM_RULES];
char *fsname;
struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */
+ struct ima_rule_opt_list *label; /* Measure data grouped under this label */
struct ima_template_desc *template;
};
.flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
};
+static struct ima_rule_entry critical_data_rules[] __ro_after_init = {
+ {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC},
+};
+
/* An array of architecture specific rules */
static struct ima_rule_entry *arch_policy_entry __ro_after_init;
static bool ima_use_appraise_tcb __initdata;
static bool ima_use_secure_boot __initdata;
+static bool ima_use_critical_data __initdata;
static bool ima_fail_unverifiable_sigs __ro_after_init;
static int __init policy_setup(char *str)
{
ima_use_appraise_tcb = true;
else if (strcmp(p, "secure_boot") == 0)
ima_use_secure_boot = true;
+ else if (strcmp(p, "critical_data") == 0)
+ ima_use_critical_data = true;
else if (strcmp(p, "fail_securely") == 0)
ima_fail_unverifiable_sigs = true;
else
}
/**
- * ima_match_keyring - determine whether the keyring matches the measure rule
+ * ima_match_rule_data - determine whether func_data matches the policy rule
* @rule: a pointer to a rule
- * @keyring: name of the keyring to match against the measure rule
+ * @func_data: data to match against the measure rule data
* @cred: a pointer to a credentials structure for user validation
*
- * Returns true if keyring matches one in the rule, false otherwise.
+ * Returns true if func_data matches one in the rule, false otherwise.
*/
-static bool ima_match_keyring(struct ima_rule_entry *rule,
- const char *keyring, const struct cred *cred)
+static bool ima_match_rule_data(struct ima_rule_entry *rule,
+ const char *func_data,
+ const struct cred *cred)
{
+ const struct ima_rule_opt_list *opt_list = NULL;
bool matched = false;
size_t i;
if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
return false;
- if (!rule->keyrings)
- return true;
+ switch (rule->func) {
+ case KEY_CHECK:
+ if (!rule->keyrings)
+ return true;
+
+ opt_list = rule->keyrings;
+ break;
+ case CRITICAL_DATA:
+ if (!rule->label)
+ return true;
+
+ opt_list = rule->label;
+ break;
+ default:
+ return false;
+ }
- if (!keyring)
+ if (!func_data)
return false;
- for (i = 0; i < rule->keyrings->count; i++) {
- if (!strcmp(rule->keyrings->items[i], keyring)) {
+ for (i = 0; i < opt_list->count; i++) {
+ if (!strcmp(opt_list->items[i], func_data)) {
matched = true;
break;
}
/**
* ima_match_rules - determine whether an inode matches the policy rule.
* @rule: a pointer to a rule
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: a pointer to an inode
* @cred: a pointer to a credentials structure for user validation
* @secid: the secid of the task to be validated
* @func: LIM hook identifier
* @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
- * @keyring: keyring name to check in policy for KEY_CHECK func
+ * @func_data: func specific data, may be NULL
*
* Returns true on rule match, false on failure.
*/
- static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
- const struct cred *cred, u32 secid,
- enum ima_hooks func, int mask,
+ static bool ima_match_rules(struct ima_rule_entry *rule,
+ struct user_namespace *mnt_userns,
+ struct inode *inode, const struct cred *cred,
+ u32 secid, enum ima_hooks func, int mask,
- const char *keyring)
+ const char *func_data)
{
int i;
- if (func == KEY_CHECK) {
- return (rule->flags & IMA_FUNC) && (rule->func == func) &&
- ima_match_keyring(rule, keyring, cred);
- }
if ((rule->flags & IMA_FUNC) &&
(rule->func != func && func != POST_SETATTR))
return false;
+
+ switch (func) {
+ case KEY_CHECK:
+ case CRITICAL_DATA:
+ return ((rule->func == func) &&
+ ima_match_rule_data(rule, func_data, cred));
+ default:
+ break;
+ }
+
if ((rule->flags & IMA_MASK) &&
(rule->mask != mask && func != POST_SETATTR))
return false;
}
if ((rule->flags & IMA_FOWNER) &&
- !rule->fowner_op(inode->i_uid, rule->fowner))
+ !rule->fowner_op(i_uid_into_mnt(mnt_userns, inode), rule->fowner))
return false;
for (i = 0; i < MAX_LSM_RULES; i++) {
int rc = 0;
/**
* ima_match_policy - decision based on LSM and other conditions
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: pointer to an inode for which the policy decision is being made
* @cred: pointer to a credentials structure for which the policy decision is
* being made
* @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
* @pcr: set the pcr to extend
* @template_desc: the template that should be used for this rule
- * @keyring: the keyring name, if given, to be used to check in the policy.
- * keyring can be NULL if func is anything other than KEY_CHECK.
+ * @func_data: func specific data, may be NULL
*
* Measure decision based on func/mask/fsmagic and LSM(subj/obj/type)
* conditions.
* list when walking it. Reads are many orders of magnitude more numerous
* than writes so ima_match_policy() is classical RCU candidate.
*/
- int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
- enum ima_hooks func, int mask, int flags, int *pcr,
+ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+ const struct cred *cred, u32 secid, enum ima_hooks func,
+ int mask, int flags, int *pcr,
struct ima_template_desc **template_desc,
- const char *keyring)
+ const char *func_data)
{
struct ima_rule_entry *entry;
int action = 0, actmask = flags | (flags << 1);
if (!(entry->action & actmask))
continue;
- if (!ima_match_rules(entry, inode, cred, secid, func, mask,
- func_data))
+ if (!ima_match_rules(entry, mnt_userns, inode, cred, secid,
- func, mask, keyring))
++ func, mask, func_data))
continue;
action |= entry->flags & IMA_ACTION_FLAGS;
ARRAY_SIZE(default_appraise_rules),
IMA_DEFAULT_POLICY);
+ if (ima_use_critical_data)
+ add_rules(critical_data_rules,
+ ARRAY_SIZE(critical_data_rules),
+ IMA_DEFAULT_POLICY);
+
ima_update_policy_flag();
}
Opt_uid_lt, Opt_euid_lt, Opt_fowner_lt,
Opt_appraise_type, Opt_appraise_flag,
Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings,
- Opt_err
+ Opt_label, Opt_err
};
static const match_table_t policy_tokens = {
{Opt_pcr, "pcr=%s"},
{Opt_template, "template=%s"},
{Opt_keyrings, "keyrings=%s"},
+ {Opt_label, "label=%s"},
{Opt_err, NULL}
};
return false;
break;
+ case CRITICAL_DATA:
+ if (entry->action & ~(MEASURE | DONT_MEASURE))
+ return false;
+
+ if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_PCR |
+ IMA_LABEL))
+ return false;
+
+ if (ima_rule_contains_lsm_cond(entry))
+ return false;
+
+ break;
default:
return false;
}
else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) &&
strcmp(args[0].from, "KEY_CHECK") == 0)
entry->func = KEY_CHECK;
+ else if (strcmp(args[0].from, "CRITICAL_DATA") == 0)
+ entry->func = CRITICAL_DATA;
else
result = -EINVAL;
if (!result)
entry->flags |= IMA_KEYRINGS;
break;
+ case Opt_label:
+ ima_log_string(ab, "label", args[0].from);
+
+ if (entry->label) {
+ result = -EINVAL;
+ break;
+ }
+
+ entry->label = ima_alloc_rule_opt_list(args);
+ if (IS_ERR(entry->label)) {
+ result = PTR_ERR(entry->label);
+ entry->label = NULL;
+ break;
+ }
+
+ entry->flags |= IMA_LABEL;
+ break;
case Opt_fsuuid:
ima_log_string(ab, "fsuuid", args[0].from);
seq_puts(m, " ");
}
+ if (entry->flags & IMA_LABEL) {
+ seq_puts(m, "label=");
+ ima_show_rule_opt_list(m, entry->label);
+ seq_puts(m, " ");
+ }
+
if (entry->flags & IMA_PCR) {
snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr);
seq_printf(m, pt(Opt_pcr), tbuf);
* Enables deferred processing of keys
*/
+ #include <linux/user_namespace.h>
#include <linux/workqueue.h>
#include <keys/asymmetric-type.h>
#include "ima.h"
list_for_each_entry_safe(entry, tmp, &ima_keys, list) {
if (!timer_expired)
- process_buffer_measurement(NULL, entry->payload,
+ process_buffer_measurement(&init_user_ns, NULL,
+ entry->payload,
entry->payload_len,
entry->keyring_name,
KEY_CHECK, 0,
- entry->keyring_name);
+ entry->keyring_name,
+ false);
list_del(&entry->list);
ima_free_key_entry(entry);
}
}
EXPORT_SYMBOL(security_inode_init_security);
+int security_inode_init_security_anon(struct inode *inode,
+ const struct qstr *name,
+ const struct inode *context_inode)
+{
+ return call_int_hook(inode_init_security_anon, 0, inode, name,
+ context_inode);
+}
+
int security_old_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr, const char **name,
void **value, size_t *len)
return call_int_hook(inode_getattr, 0, path);
}
- int security_inode_setxattr(struct dentry *dentry, const char *name,
+ int security_inode_setxattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int ret;
* SELinux and Smack integrate the cap call,
* so assume that all LSMs supplying this call do so.
*/
- ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
- flags);
+ ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value,
+ size, flags);
if (ret == 1)
ret = cap_inode_setxattr(dentry, name, value, size, flags);
return call_int_hook(inode_listxattr, 0, dentry);
}
- int security_inode_removexattr(struct dentry *dentry, const char *name)
+ int security_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name)
{
int ret;
* SELinux and Smack integrate the cap call,
* so assume that all LSMs supplying this call do so.
*/
- ret = call_int_hook(inode_removexattr, 1, dentry, name);
+ ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name);
if (ret == 1)
- ret = cap_inode_removexattr(dentry, name);
+ ret = cap_inode_removexattr(mnt_userns, dentry, name);
if (ret)
return ret;
ret = ima_inode_removexattr(dentry, name);
return call_int_hook(inode_need_killpriv, 0, dentry);
}
- int security_inode_killpriv(struct dentry *dentry)
+ int security_inode_killpriv(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
- return call_int_hook(inode_killpriv, 0, dentry);
+ return call_int_hook(inode_killpriv, 0, mnt_userns, dentry);
}
- int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ int security_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name,
+ void **buffer, bool alloc)
{
struct security_hook_list *hp;
int rc;
* Only one module will provide an attribute with a given name.
*/
hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) {
- rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc);
+ rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
if (rc != LSM_RET_DEFAULT(inode_getsecurity))
return rc;
}
}
}
+static int sb_check_xattr_support(struct super_block *sb)
+{
+ struct superblock_security_struct *sbsec = sb->s_security;
+ struct dentry *root = sb->s_root;
+ struct inode *root_inode = d_backing_inode(root);
+ u32 sid;
+ int rc;
+
+ /*
+ * Make sure that the xattr handler exists and that no
+ * error other than -ENODATA is returned by getxattr on
+ * the root directory. -ENODATA is ok, as this may be
+ * the first boot of the SELinux kernel before we have
+ * assigned xattr values to the filesystem.
+ */
+ if (!(root_inode->i_opflags & IOP_XATTR)) {
+ pr_warn("SELinux: (dev %s, type %s) has no xattr support\n",
+ sb->s_id, sb->s_type->name);
+ goto fallback;
+ }
+
+ rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
+ if (rc < 0 && rc != -ENODATA) {
+ if (rc == -EOPNOTSUPP) {
+ pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n",
+ sb->s_id, sb->s_type->name);
+ goto fallback;
+ } else {
+ pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n",
+ sb->s_id, sb->s_type->name, -rc);
+ return rc;
+ }
+ }
+ return 0;
+
+fallback:
+ /* No xattr support - try to fallback to genfs if possible. */
+ rc = security_genfs_sid(&selinux_state, sb->s_type->name, "/",
+ SECCLASS_DIR, &sid);
+ if (rc)
+ return -EOPNOTSUPP;
+
+ pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n",
+ sb->s_id, sb->s_type->name);
+ sbsec->behavior = SECURITY_FS_USE_GENFS;
+ sbsec->sid = sid;
+ return 0;
+}
+
static int sb_finish_set_opts(struct super_block *sb)
{
struct superblock_security_struct *sbsec = sb->s_security;
int rc = 0;
if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
- /* Make sure that the xattr handler exists and that no
- error other than -ENODATA is returned by getxattr on
- the root directory. -ENODATA is ok, as this may be
- the first boot of the SELinux kernel before we have
- assigned xattr values to the filesystem. */
- if (!(root_inode->i_opflags & IOP_XATTR)) {
- pr_warn("SELinux: (dev %s, type %s) has no "
- "xattr support\n", sb->s_id, sb->s_type->name);
- rc = -EOPNOTSUPP;
- goto out;
- }
-
- rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
- if (rc < 0 && rc != -ENODATA) {
- if (rc == -EOPNOTSUPP)
- pr_warn("SELinux: (dev %s, type "
- "%s) has no security xattr handler\n",
- sb->s_id, sb->s_type->name);
- else
- pr_warn("SELinux: (dev %s, type "
- "%s) getxattr errno %d\n", sb->s_id,
- sb->s_type->name, -rc);
- goto out;
- }
+ rc = sb_check_xattr_support(sb);
+ if (rc)
+ return rc;
}
sbsec->flags |= SE_SBINITIALIZED;
spin_lock(&sbsec->isec_lock);
}
spin_unlock(&sbsec->isec_lock);
-out:
return rc;
}
static inline int default_protocol_stream(int protocol)
{
- return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP);
+ return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP ||
+ protocol == IPPROTO_MPTCP);
}
static inline int default_protocol_dgram(int protocol)
return 0;
}
+static int selinux_inode_init_security_anon(struct inode *inode,
+ const struct qstr *name,
+ const struct inode *context_inode)
+{
+ const struct task_security_struct *tsec = selinux_cred(current_cred());
+ struct common_audit_data ad;
+ struct inode_security_struct *isec;
+ int rc;
+
+ if (unlikely(!selinux_initialized(&selinux_state)))
+ return 0;
+
+ isec = selinux_inode(inode);
+
+ /*
+ * We only get here once per ephemeral inode. The inode has
+ * been initialized via inode_alloc_security but is otherwise
+ * untouched.
+ */
+
+ if (context_inode) {
+ struct inode_security_struct *context_isec =
+ selinux_inode(context_inode);
+ if (context_isec->initialized != LABEL_INITIALIZED) {
+ pr_err("SELinux: context_inode is not initialized");
+ return -EACCES;
+ }
+
+ isec->sclass = context_isec->sclass;
+ isec->sid = context_isec->sid;
+ } else {
+ isec->sclass = SECCLASS_ANON_INODE;
+ rc = security_transition_sid(
+ &selinux_state, tsec->sid, tsec->sid,
+ isec->sclass, name, &isec->sid);
+ if (rc)
+ return rc;
+ }
+
+ isec->initialized = LABEL_INITIALIZED;
+ /*
+ * Now that we've initialized security, check whether we're
+ * allowed to actually create this type of anonymous inode.
+ */
+
+ ad.type = LSM_AUDIT_DATA_INODE;
+ ad.u.inode = inode;
+
+ return avc_has_perm(&selinux_state,
+ tsec->sid,
+ isec->sid,
+ isec->sclass,
+ FILE__CREATE,
+ &ad);
+}
+
static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return may_create(dir, dentry, SECCLASS_FILE);
return true;
}
- static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
+ static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
struct inode *inode = d_backing_inode(dentry);
}
if (!selinux_initialized(&selinux_state))
- return (inode_owner_or_capable(inode) ? 0 : -EPERM);
+ return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
sbsec = inode->i_sb->s_security;
if (!(sbsec->flags & SBLABEL_MNT))
return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
ad.type = LSM_AUDIT_DATA_DENTRY;
return dentry_has_perm(cred, dentry, FILE__GETATTR);
}
- static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
+ static int selinux_inode_removexattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *name)
{
if (strcmp(name, XATTR_NAME_SELINUX)) {
- int rc = cap_inode_removexattr(dentry, name);
+ int rc = cap_inode_removexattr(mnt_userns, dentry, name);
if (rc)
return rc;
*
* Permission check is handled by selinux_inode_getxattr hook.
*/
- static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ static int selinux_inode_getsecurity(struct user_namespace *mnt_userns,
+ struct inode *inode, const char *name,
+ void **buffer, bool alloc)
{
u32 size;
int error;
static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
const int len = sizeof(XATTR_NAME_SELINUX);
+
+ if (!selinux_initialized(&selinux_state))
+ return 0;
+
if (buffer && len <= buffer_size)
memcpy(buffer, XATTR_NAME_SELINUX, len);
return len;
*/
static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
- return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0);
+ return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX,
+ ctx, ctxlen, 0);
}
static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
int len = 0;
- len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
- ctx, true);
+ len = selinux_inode_getsecurity(&init_user_ns, inode,
+ XATTR_SELINUX_SUFFIX, ctx, true);
if (len < 0)
return len;
*ctxlen = len;
LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
+ LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon),
LSM_HOOK_INIT(inode_create, selinux_inode_create),
LSM_HOOK_INIT(inode_link, selinux_inode_link),
LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
TARGETS += memory-hotplug
TARGETS += mincore
TARGETS += mount
+ TARGETS += mount_setattr
TARGETS += mqueue
+TARGETS += nci
TARGETS += net
TARGETS += net/forwarding
TARGETS += net/mptcp
export KSFT_KHDR_INSTALL_DONE := 1
export BUILD
-# build and run gpio when output directory is the src dir.
-# gpio has dependency on tools/gpio and builds tools/gpio
-# objects in the src directory in all cases making the src
-# repo dirty even when objects are relocated.
-ifneq (1,$(DEFAULT_INSTALL_HDR_PATH))
- TMP := $(filter-out gpio, $(TARGETS))
- TARGETS := $(TMP)
-endif
-
# set default goal to all, so make without a target runs all, even when
# all isn't the first target in the file.
.DEFAULT_GOAL := all