This patch fixes an issue with the delalloc metadata space reservation
code. The problem is we used to free the reservation as soon as we
allocated the delalloc region. The problem with this is if we are not
inserting an inline extent, we don't actually insert the extent item until
after the ordered extent is written out. This patch does 3 things,
1) It moves the reservation clearing stuff into the ordered code, so when
we remove the ordered extent we remove the reservation.
2) It adds a EXTENT_DO_ACCOUNTING flag that gets passed when we clear
delalloc bits in the cases where we want to clear the metadata reservation
when we clear the delalloc extent, in the case that we do an inline extent
or we invalidate the page.
3) It adds another waitqueue to the space info so that when we start a fs
wide delalloc flush, anybody else who also hits that area will simply wait
for the flush to finish and then try to make their allocation.
This has been tested thoroughly to make sure we did not regress on
performance.
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
u64 last_unlink_trans;
/*
u64 last_unlink_trans;
/*
- * These two counters are for delalloc metadata reservations. We keep
- * track of how many extents we've accounted for vs how many extents we
- * have.
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
- int delalloc_reserved_extents;
- int delalloc_extents;
+ spinlock_t accounting_lock;
+ int reserved_extents;
+ int outstanding_extents;
/*
* ordered_data_close is set by truncate when a file that used
/*
* ordered_data_close is set by truncate when a file that used
int allocating_chunk;
wait_queue_head_t wait;
int allocating_chunk;
wait_queue_head_t wait;
+
+ int flushing;
+ wait_queue_head_t flush_wait;
num_items);
spin_lock(&meta_sinfo->lock);
num_items);
spin_lock(&meta_sinfo->lock);
- if (BTRFS_I(inode)->delalloc_reserved_extents <=
- BTRFS_I(inode)->delalloc_extents) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ if (BTRFS_I(inode)->reserved_extents <=
+ BTRFS_I(inode)->outstanding_extents) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
spin_unlock(&meta_sinfo->lock);
return 0;
}
spin_unlock(&meta_sinfo->lock);
return 0;
}
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->delalloc_reserved_extents--;
- BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+ BTRFS_I(inode)->reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
if (meta_sinfo->bytes_delalloc < num_bytes) {
bug = true;
if (meta_sinfo->bytes_delalloc < num_bytes) {
bug = true;
meta_sinfo->force_delalloc = 0;
}
meta_sinfo->force_delalloc = 0;
}
+static void flush_delalloc(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ bool wait = false;
+
+ spin_lock(&info->lock);
+
+ if (!info->flushing) {
+ info->flushing = 1;
+ init_waitqueue_head(&info->flush_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->flush_wait,
+ !info->flushing);
+ return;
+ }
+
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+}
+
static int maybe_allocate_chunk(struct btrfs_root *root,
struct btrfs_space_info *info)
{
static int maybe_allocate_chunk(struct btrfs_root *root,
struct btrfs_space_info *info)
{
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ flush_delalloc(root, meta_sinfo);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
printk(KERN_ERR "enospc, has %d, reserved %d\n",
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
printk(KERN_ERR "enospc, has %d, reserved %d\n",
- BTRFS_I(inode)->delalloc_extents,
- BTRFS_I(inode)->delalloc_reserved_extents);
+ BTRFS_I(inode)->outstanding_extents,
+ BTRFS_I(inode)->reserved_extents);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
- BTRFS_I(inode)->delalloc_reserved_extents++;
+ BTRFS_I(inode)->reserved_extents++;
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ flush_delalloc(root, meta_sinfo);
goto again;
}
spin_lock(&meta_sinfo->lock);
goto again;
}
spin_lock(&meta_sinfo->lock);
struct extent_state *state, int bits, int wake,
int delete)
{
struct extent_state *state, int bits, int wake,
int delete)
{
- int ret = state->state & bits;
+ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int ret = state->state & bits_to_clear;
if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
+ state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
if (delete || state->state == 0) {
if (wake)
wake_up(&state->wq);
if (delete || state->state == 0) {
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0,
if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
+ if (op & EXTENT_CLEAR_ACCOUNTING)
+ clear_bits |= EXTENT_DO_ACCOUNTING;
+
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
- if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK |
- EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2)))
+ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+ EXTENT_SET_PRIVATE2)))
return 0;
while (nr_pages > 0) {
return 0;
while (nr_pages > 0) {
lock_extent(tree, start, end, GFP_NOFS);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
lock_extent(tree, start, end, GFP_NOFS);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
1, 1, NULL, GFP_NOFS);
return 0;
}
1, 1, NULL, GFP_NOFS);
return 0;
}
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
#define EXTENT_SET_WRITEBACK 0x10
#define EXTENT_END_WRITEBACK 0x20
#define EXTENT_SET_PRIVATE2 0x40
#define EXTENT_SET_WRITEBACK 0x10
#define EXTENT_END_WRITEBACK 0x20
#define EXTENT_SET_PRIVATE2 0x40
+#define EXTENT_CLEAR_ACCOUNTING 0x80
/*
* page->private values. Every page that is controlled by the extent
/*
* page->private values. Every page that is controlled by the extent
btrfs_put_ordered_extent(ordered);
clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
btrfs_put_ordered_extent(ordered);
clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, GFP_NOFS);
GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, GFP_NOFS);
start, end, NULL,
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_CLEAR_DELALLOC |
start, end, NULL,
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
ret = 0;
goto free_pages_out;
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
ret = 0;
goto free_pages_out;
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
root->fs_info->max_extent);
/*
root->fs_info->max_extent);
/*
- * if we break a large extent up then leave delalloc_extents be,
- * since we've already accounted for the large extent.
+ * if we break a large extent up then leave oustanding_extents
+ * be, since we've already accounted for the large extent.
*/
if (div64_u64(new_size + root->fs_info->max_extent - 1,
root->fs_info->max_extent) < num_extents)
return 0;
}
*/
if (div64_u64(new_size + root->fs_info->max_extent - 1,
root->fs_info->max_extent) < num_extents)
return 0;
}
- BTRFS_I(inode)->delalloc_extents++;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= root->fs_info->max_extent) {
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= root->fs_info->max_extent) {
- BTRFS_I(inode)->delalloc_extents--;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
root->fs_info->max_extent) > num_extents)
return 0;
root->fs_info->max_extent) > num_extents)
return 0;
- BTRFS_I(inode)->delalloc_extents--;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- BTRFS_I(inode)->delalloc_extents++;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- BTRFS_I(inode)->delalloc_extents--;
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (bits & EXTENT_DO_ACCOUNTING) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ }
spin_lock(&root->fs_info->delalloc_lock);
if (state->end - state->start + 1 >
spin_lock(&root->fs_info->delalloc_lock);
if (state->end - state->start + 1 >
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+ NULL, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
- 1, 1, NULL, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
* prepare_pages in the normal write path.
*/
clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
* prepare_pages in the normal write path.
*/
clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
return NULL;
ei->last_trans = 0;
ei->logged_trans = 0;
return NULL;
ei->last_trans = 0;
ei->logged_trans = 0;
- ei->delalloc_extents = 0;
- ei->delalloc_reserved_extents = 0;
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+ spin_lock_init(&ei->accounting_lock);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+ inode, 1);
+
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);