Commit
84c17543ab56 (ext4: move work from io_end to inode) triggered a
regression when running xfstest #270 when the file system is mounted
with dioread_nolock.
The problem is that after ext4_evict_inode() calls ext4_ioend_wait(),
this guarantees that last io_end structure has been freed, but it does
not guarantee that the workqueue structure, which was moved into the
inode by commit
84c17543ab56, is actually finished. Once
ext4_flush_completed_IO() calls ext4_free_io_end() on CPU #1, this
will allow ext4_ioend_wait() to return on CPU #2, at which point the
evict_inode() codepath can race against the workqueue code on CPU #1
accessing EXT4_I(inode)->i_unwritten_work to find the next item of
work to do.
Fix this by calling cancel_work_sync() in ext4_ioend_wait(), which
will be renamed ext4_ioend_shutdown(), since it is only used by
ext4_evict_inode(). Also, move the call to ext4_ioend_shutdown()
until after truncate_inode_pages() and filemap_write_and_wait() are
called, to make sure all dirty pages have been written back and
flushed from the page cache first.
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<
c01dda6a>] cwq_activate_delayed_work+0x3b/0x7e
*pdpt =
0000000030bc3001 *pde =
0000000000000000
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
Modules linked in:
Pid: 6, comm: kworker/u:0 Not tainted 3.8.0-rc3-00013-g84c1754-dirty #91 Bochs Bochs
EIP: 0060:[<
c01dda6a>] EFLAGS:
00010046 CPU: 0
EIP is at cwq_activate_delayed_work+0x3b/0x7e
EAX:
00000000 EBX:
00000000 ECX:
f505fe54 EDX:
00000000
ESI:
ed5b697c EDI:
00000006 EBP:
f64b7e8c ESP:
f64b7e84
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0:
8005003b CR2:
00000000 CR3:
30bc2000 CR4:
000006f0
DR0:
00000000 DR1:
00000000 DR2:
00000000 DR3:
00000000
DR6:
ffff0ff0 DR7:
00000400
Process kworker/u:0 (pid: 6, ti=
f64b6000 task=
f64b4160 task.ti=
f64b6000)
Stack:
f505fe00 00000006 f64b7e9c c01de3d7 f6435540 00000003 f64b7efc c01def1d
f6435540 00000002 00000000 0000008a c16d0808 c040a10b c16d07d8 c16d08b0
f505fe00 c16d0780 00000000 00000000 ee153df4 c1ce4a30 c17d0e30 00000000
Call Trace:
[<
c01de3d7>] cwq_dec_nr_in_flight+0x71/0xfb
[<
c01def1d>] process_one_work+0x5d8/0x637
[<
c040a10b>] ? ext4_end_bio+0x300/0x300
[<
c01e3105>] worker_thread+0x249/0x3ef
[<
c01ea317>] kthread+0xd8/0xeb
[<
c01e2ebc>] ? manage_workers+0x4bb/0x4bb
[<
c023a370>] ? trace_hardirqs_on+0x27/0x37
[<
c0f1b4b7>] ret_from_kernel_thread+0x1b/0x28
[<
c01ea23f>] ? __init_kthread_worker+0x71/0x71
Code: 01 83 15 ac ff 6c c1 00 31 db 89 c6 8b 00 a8 04 74 12 89 c3 30 db 83 05 b0 ff 6c c1 01 83 15 b4 ff 6c c1 00 89 f0 e8 42 ff ff ff <8b> 13 89 f0 83 05 b8 ff 6c c1
6c c1 00 31 c9 83
EIP: [<
c01dda6a>] cwq_activate_delayed_work+0x3b/0x7e SS:ESP 0068:
f64b7e84
CR2:
0000000000000000
---[ end trace
a1923229da53d8a4 ]---
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_wait(struct inode *);
+extern void ext4_ioend_shutdown(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern void ext4_end_io_work(struct work_struct *work);
trace_ext4_evict_inode(inode);
- ext4_ioend_wait(inode);
-
if (inode->i_nlink) {
/*
* When journalling data dirty buffers are tracked only in the
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
+ ext4_ioend_shutdown(inode);
goto no_delete;
}
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
+ ext4_ioend_shutdown(inode);
if (is_bad_inode(inode))
goto no_delete;
kmem_cache_destroy(io_page_cachep);
}
-void ext4_ioend_wait(struct inode *inode)
+/*
+ * This function is called by ext4_evict_inode() to make sure there is
+ * no more pending I/O completion work left to do.
+ */
+void ext4_ioend_shutdown(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+ /*
+ * We need to make sure the work structure is finished being
+ * used before we let the inode get destroyed.
+ */
+ if (work_pending(&EXT4_I(inode)->i_unwritten_work))
+ cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
}
static void put_io_page(struct ext4_io_page *io_page)