ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()
authorShuning Zhang <sunny.s.zhang@oracle.com>
Wed, 6 Nov 2019 05:16:34 +0000 (21:16 -0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 12 Nov 2019 18:21:18 +0000 (19:21 +0100)
[ Upstream commit e74540b285569d2b1e14fe7aee92297078f235ce ]

When the extent tree is modified, it should be protected by inode
cluster lock and ip_alloc_sem.

The extent tree is accessed and modified in the
ocfs2_prepare_inode_for_write, but isn't protected by ip_alloc_sem.

The following is a case.  The function ocfs2_fiemap is accessing the
extent tree, which is modified at the same time.

  kernel BUG at fs/ocfs2/extent_map.c:475!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: tun ocfs2 ocfs2_nodemanager configfs ocfs2_stackglue [...]
  CPU: 16 PID: 14047 Comm: o2info Not tainted 4.1.12-124.23.1.el6uek.x86_64 #2
  Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB MECH, X7-2L, BIOS 42040600 10/19/2018
  task: ffff88019487e200 ti: ffff88003daa4000 task.ti: ffff88003daa4000
  RIP: ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  Call Trace:
    ocfs2_fiemap+0x1e3/0x430 [ocfs2]
    do_vfs_ioctl+0x155/0x510
    SyS_ioctl+0x81/0xa0
    system_call_fastpath+0x18/0xd8
  Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
  RIP  ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  ---[ end trace c8aa0c8180e869dc ]---
  Kernel panic - not syncing: Fatal exception
  Kernel Offset: disabled

This issue can be reproduced every week in a production environment.

This issue is related to the usage mode.  If others use ocfs2 in this
mode, the kernel will panic frequently.

[akpm@linux-foundation.org: coding style fixes]
[Fix new warning due to unused function by removing said function - Linus ]
Link: http://lkml.kernel.org/r/1568772175-2906-2-git-send-email-sunny.s.zhang@oracle.com
Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
fs/ocfs2/file.c

index 9fa35cb..a847fe5 100644 (file)
@@ -2106,54 +2106,90 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
        return 0;
 }
 
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
-                                           struct file *file,
-                                           loff_t pos, size_t count,
-                                           int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+                                           struct buffer_head **di_bh,
+                                           int meta_level,
+                                           int overwrite_io,
+                                           int write_sem,
+                                           int wait)
 {
-       int ret;
-       struct buffer_head *di_bh = NULL;
-       u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
-       u32 clusters =
-               ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+       int ret = 0;
 
-       ret = ocfs2_inode_lock(inode, &di_bh, 1);
-       if (ret) {
-               mlog_errno(ret);
+       if (wait)
+               ret = ocfs2_inode_lock(inode, NULL, meta_level);
+       else
+               ret = ocfs2_try_inode_lock(inode,
+                       overwrite_io ? NULL : di_bh, meta_level);
+       if (ret < 0)
                goto out;
+
+       if (wait) {
+               if (write_sem)
+                       down_write(&OCFS2_I(inode)->ip_alloc_sem);
+               else
+                       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+       } else {
+               if (write_sem)
+                       ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+               else
+                       ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+               if (!ret) {
+                       ret = -EAGAIN;
+                       goto out_unlock;
+               }
        }
 
-       *meta_level = 1;
+       return ret;
 
-       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
-       if (ret)
-               mlog_errno(ret);
+out_unlock:
+       brelse(*di_bh);
+       ocfs2_inode_unlock(inode, meta_level);
 out:
-       brelse(di_bh);
        return ret;
 }
 
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+                                              struct buffer_head **di_bh,
+                                              int meta_level,
+                                              int write_sem)
+{
+       if (write_sem)
+               up_write(&OCFS2_I(inode)->ip_alloc_sem);
+       else
+               up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       brelse(*di_bh);
+       *di_bh = NULL;
+
+       if (meta_level >= 0)
+               ocfs2_inode_unlock(inode, meta_level);
+}
+
 static int ocfs2_prepare_inode_for_write(struct file *file,
                                         loff_t pos, size_t count, int wait)
 {
        int ret = 0, meta_level = 0, overwrite_io = 0;
+       int write_sem = 0;
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = d_inode(dentry);
        struct buffer_head *di_bh = NULL;
        loff_t end;
+       u32 cpos;
+       u32 clusters;
 
        /*
         * We start with a read level meta lock and only jump to an ex
         * if we need to make modifications here.
         */
        for(;;) {
-               if (wait)
-                       ret = ocfs2_inode_lock(inode, NULL, meta_level);
-               else
-                       ret = ocfs2_try_inode_lock(inode,
-                               overwrite_io ? NULL : &di_bh, meta_level);
+               ret = ocfs2_inode_lock_for_extent_tree(inode,
+                                                      &di_bh,
+                                                      meta_level,
+                                                      overwrite_io,
+                                                      write_sem,
+                                                      wait);
                if (ret < 0) {
-                       meta_level = -1;
                        if (ret != -EAGAIN)
                                mlog_errno(ret);
                        goto out;
@@ -2165,15 +2201,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                 */
                if (!wait && !overwrite_io) {
                        overwrite_io = 1;
-                       if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
-                               ret = -EAGAIN;
-                               goto out_unlock;
-                       }
 
                        ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
-                       brelse(di_bh);
-                       di_bh = NULL;
-                       up_read(&OCFS2_I(inode)->ip_alloc_sem);
                        if (ret < 0) {
                                if (ret != -EAGAIN)
                                        mlog_errno(ret);
@@ -2192,7 +2221,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                 * set inode->i_size at the end of a write. */
                if (should_remove_suid(dentry)) {
                        if (meta_level == 0) {
-                               ocfs2_inode_unlock(inode, meta_level);
+                               ocfs2_inode_unlock_for_extent_tree(inode,
+                                                                  &di_bh,
+                                                                  meta_level,
+                                                                  write_sem);
                                meta_level = 1;
                                continue;
                        }
@@ -2208,18 +2240,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 
                ret = ocfs2_check_range_for_refcount(inode, pos, count);
                if (ret == 1) {
-                       ocfs2_inode_unlock(inode, meta_level);
-                       meta_level = -1;
-
-                       ret = ocfs2_prepare_inode_for_refcount(inode,
-                                                              file,
-                                                              pos,
-                                                              count,
-                                                              &meta_level);
+                       ocfs2_inode_unlock_for_extent_tree(inode,
+                                                          &di_bh,
+                                                          meta_level,
+                                                          write_sem);
+                       ret = ocfs2_inode_lock_for_extent_tree(inode,
+                                                              &di_bh,
+                                                              meta_level,
+                                                              overwrite_io,
+                                                              1,
+                                                              wait);
+                       write_sem = 1;
+                       if (ret < 0) {
+                               if (ret != -EAGAIN)
+                                       mlog_errno(ret);
+                               goto out;
+                       }
+
+                       cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+                       clusters =
+                               ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+                       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
                }
 
                if (ret < 0) {
-                       mlog_errno(ret);
+                       if (ret != -EAGAIN)
+                               mlog_errno(ret);
                        goto out_unlock;
                }
 
@@ -2230,10 +2276,10 @@ out_unlock:
        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
                                            pos, count, wait);
 
-       brelse(di_bh);
-
-       if (meta_level >= 0)
-               ocfs2_inode_unlock(inode, meta_level);
+       ocfs2_inode_unlock_for_extent_tree(inode,
+                                          &di_bh,
+                                          meta_level,
+                                          write_sem);
 
 out:
        return ret;