--- /dev/null
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include "extent_map.h"
+
+/*
+ * slab caches created in extent_map_init(): one backs struct extent_map
+ * allocations, the other backs struct extent_state allocations.
+ */
+static struct kmem_cache *extent_map_cache;
+static struct kmem_cache *extent_state_cache;
+
+/*
+ * Generic view of an rb-tree element used by tree_insert(),
+ * __tree_search() and tree_delete().  Nodes are only accessed through
+ * these fields via rb_entry().
+ * NOTE(review): this presumably relies on struct extent_map and
+ * struct extent_state starting with the same field layout -- confirm
+ * against extent_map.h.
+ */
+struct tree_entry {
+ /* first byte covered by the entry; searches treat [start, end] as inclusive */
+ u64 start;
+ /* last byte covered by the entry */
+ u64 end;
+ /* set to 1 by tree_insert(), cleared when the node is removed */
+ int in_tree;
+ struct rb_node rb_node;
+};
+
+/*
+ * bits for the extent state.  Each value is a distinct flag, so several
+ * of them can be set in extent_state->state at the same time.
+ */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+
+/* merge_state() refuses to merge a state that has any of these bits set */
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/*
+ * Every extent_state is linked on all_states by alloc_extent_state() and
+ * unlinked by free_extent_state(); extent_map_exit() reports whatever is
+ * still on the list as leaked.  state_lock is taken around the list
+ * add/remove in those two functions.
+ */
+static LIST_HEAD(all_states);
+spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Create the two slab caches used by this file.  The pointers returned
+ * by kmem_cache_create() are stored as-is; no failure check is done here.
+ */
+void __init extent_map_init(void)
+{
+	const unsigned long cache_flags = SLAB_RECLAIM_ACCOUNT |
+					  SLAB_DESTROY_BY_RCU;
+
+	extent_map_cache = kmem_cache_create("extent_map",
+					     sizeof(struct extent_map), 0,
+					     cache_flags, NULL);
+	extent_state_cache = kmem_cache_create("extent_state",
+					       sizeof(struct extent_state), 0,
+					       cache_flags, NULL);
+}
+
+/*
+ * Module teardown.  Any extent_state still linked on all_states at this
+ * point was never freed: report it, unlink it and give it back to the
+ * slab cache it was allocated from.  Afterwards both caches are destroyed
+ * (if they were created).
+ */
+void __exit extent_map_exit(void)
+{
+	struct extent_state *state;
+
+	while (!list_empty(&all_states)) {
+		state = list_entry(all_states.next, struct extent_state, list);
+		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
+		       state->start, state->end, state->state, state->in_tree);
+		list_del(&state->list);
+		/*
+		 * states are allocated with kmem_cache_alloc() from
+		 * extent_state_cache, so they must be released through the
+		 * same cache rather than with kfree()
+		 */
+		kmem_cache_free(extent_state_cache, state);
+	}
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+}
+
+/*
+ * Prepare an extent_map_tree for use: remember the address_space it
+ * belongs to, initialise its rwlock and start with both rb-trees (the
+ * mapping tree and the state tree) empty.  'mask' is currently unused.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{
+	tree->mapping = mapping;
+	rwlock_init(&tree->lock);
+	tree->state.rb_node = NULL;
+	tree->map.rb_node = NULL;
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+
+/*
+ * Allocate an extent_map from extent_map_cache using 'mask'.
+ *
+ * On success the map is marked as not being in any tree and carries a
+ * single reference.  A NULL or error pointer coming back from the
+ * allocator is handed to the caller unchanged.
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em = kmem_cache_alloc(extent_map_cache, mask);
+
+	if (em && !IS_ERR(em)) {
+		em->in_tree = 0;
+		atomic_set(&em->refs, 1);
+	}
+	return em;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+
+/*
+ * Drop one reference on 'em'.  When the last reference goes away the map
+ * is returned to extent_map_cache; a warning is emitted if it is still
+ * flagged as being in a tree at that point.
+ */
+void free_extent_map(struct extent_map *em)
+{
+	if (!atomic_dec_and_test(&em->refs))
+		return;
+
+	WARN_ON(em->in_tree);
+	kmem_cache_free(extent_map_cache, em);
+}
+EXPORT_SYMBOL(free_extent_map);
+
+
+/*
+ * Allocate an extent_state from extent_state_cache using 'mask'.
+ *
+ * On success the state has no bits set, is not in a tree, carries one
+ * reference, has an initialised wait queue and is linked on all_states
+ * for leak tracking.  A NULL or error pointer from the allocator is
+ * returned unchanged.
+ *
+ * state_lock is taken with irqsave/irqrestore so that the caller's
+ * interrupt state is preserved; spin_unlock_irq() would unconditionally
+ * re-enable interrupts even if the caller had them disabled.
+ */
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+	unsigned long flags;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state || IS_ERR(state))
+		return state;
+	state->state = 0;
+	state->in_tree = 0;
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &all_states);
+	spin_unlock_irqrestore(&state_lock, flags);
+	return state;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+/*
+ * Drop one reference on 'state'.  When the last reference goes away the
+ * state is unlinked from all_states and returned to extent_state_cache;
+ * a warning is emitted if it is still flagged as being in a tree.
+ *
+ * This is called from merge_state() and clear_state_bit(), i.e. while
+ * the caller holds tree->lock taken with write_lock_irq().  The list
+ * lock therefore has to be taken with irqsave/irqrestore: a plain
+ * spin_unlock_irq() would turn interrupts back on while the tree lock is
+ * still held.
+ */
+void free_extent_state(struct extent_state *state)
+{
+	unsigned long flags;
+
+	if (atomic_dec_and_test(&state->refs)) {
+		WARN_ON(state->in_tree);
+		spin_lock_irqsave(&state_lock, flags);
+		list_del_init(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+EXPORT_SYMBOL(free_extent_state);
+
+/*
+ * Link 'node' into 'root', using 'offset' as the search key.
+ *
+ * Returns NULL when the node was inserted (its in_tree flag is set), or
+ * the already present node whose [start, end] range contains 'offset',
+ * in which case nothing is inserted.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset >= cur->start && offset <= cur->end)
+			return parent;
+
+		if (offset < cur->start)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	cur = rb_entry(node, struct tree_entry, rb_node);
+	cur->in_tree = 1;
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * Look for the node whose [start, end] range contains 'offset'.
+ *
+ * Returns that node, or NULL when no node contains 'offset'.  In the
+ * NULL case, if 'prev_ret' is non-NULL, *prev_ret receives the first node
+ * (walking forward with rb_next() from where the descent stopped) whose
+ * end is not below 'offset', or NULL when there is none.  *prev_ret is
+ * left untouched when an exact match is returned.
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node *cur = root->rb_node;
+	struct rb_node *last = NULL;
+	struct tree_entry *entry;
+
+	while (cur) {
+		entry = rb_entry(cur, struct tree_entry, rb_node);
+		last = cur;
+
+		if (offset < entry->start)
+			cur = cur->rb_left;
+		else if (offset > entry->end)
+			cur = cur->rb_right;
+		else
+			return cur;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	/* step forward until we reach something ending at or after offset */
+	while (last) {
+		entry = rb_entry(last, struct tree_entry, rb_node);
+		if (offset <= entry->end)
+			break;
+		last = rb_next(last);
+	}
+	*prev_ret = last;
+	return NULL;
+}
+
+/*
+ * Return the node containing 'offset', or failing that the fallback node
+ * reported by __tree_search() (the next node ending at or after
+ * 'offset').  NULL means neither exists.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *fallback;
+	struct rb_node *exact = __tree_search(root, offset, &fallback);
+
+	return exact ? exact : fallback;
+}
+
+/*
+ * Remove the node whose range contains 'offset' from 'root' and clear
+ * its in_tree flag.  Returns 0 on success or -ENOENT when no node
+ * contains 'offset'.
+ */
+static int tree_delete(struct rb_root *root, u64 offset)
+{
+	struct rb_node *victim = __tree_search(root, offset, NULL);
+
+	if (!victim)
+		return -ENOENT;
+
+	rb_entry(victim, struct tree_entry, rb_node)->in_tree = 0;
+	rb_erase(victim, root);
+	return 0;
+}
+
+/*
+ * Insert 'em' into the mapping tree (keyed by em->end) and take an extra
+ * reference on it for the tree; the struct itself is inserted, no copy
+ * is made.
+ *
+ * After a successful insert a simple backward merge is attempted: when
+ * the previous mapping ends right before em->start and is either also a
+ * hole (both block_start == 0) or contiguous on disk, 'em' is grown to
+ * cover it and the previous mapping is removed from the tree and
+ * released.
+ *
+ * Returns 0 on success or -EEXIST when an existing mapping already
+ * covers em->end.  The tree lock is taken here.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em)
+{
+	struct extent_map *neighbor;
+	struct rb_node *node;
+	int ret = 0;
+
+	write_lock_irq(&tree->lock);
+
+	node = tree_insert(&tree->map, em->end, &em->rb_node);
+	if (node) {
+		neighbor = rb_entry(node, struct extent_map, rb_node);
+		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", neighbor->start, neighbor->end, em->start, em->end);
+		ret = -EEXIST;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+
+	/* nothing can precede a mapping that starts at offset zero */
+	if (em->start == 0)
+		goto out;
+
+	node = rb_prev(&em->rb_node);
+	if (!node)
+		goto out;
+	neighbor = rb_entry(node, struct extent_map, rb_node);
+
+	if (neighbor->end + 1 != em->start)
+		goto out;
+	if (!((em->block_start == 0 && neighbor->block_start == 0) ||
+	      (em->block_start == neighbor->block_end + 1)))
+		goto out;
+
+	em->start = neighbor->start;
+	em->block_start = neighbor->block_start;
+	rb_erase(&neighbor->rb_node, &tree->map);
+	neighbor->in_tree = 0;
+	free_extent_map(neighbor);
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+
+/*
+ * Find the first extent_map in the tree that intersects the inclusive
+ * range [start, end] and return it with an extra reference taken.
+ * Returns NULL when nothing intersects.
+ *
+ * More than one mapping may intersect the range; callers have to inspect
+ * the returned object and repeat the lookup if it does not cover
+ * everything they need.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end)
+{
+	struct extent_map *em = NULL;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+
+	node = tree_search(&tree->map, start);
+	if (!node)
+		goto out;
+	if (IS_ERR(node)) {
+		em = ERR_PTR(PTR_ERR(node));
+		goto out;
+	}
+
+	em = rb_entry(node, struct extent_map, rb_node);
+	if (em->end >= start && em->start <= end)
+		atomic_inc(&em->refs);
+	else
+		em = NULL;
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+
+/*
+ * Take the mapping covering em->end out of the tree.  No reference is
+ * dropped and nothing checks whether the range is still in use.
+ * Returns the result of tree_delete(): 0, or -ENOENT if no mapping
+ * covers em->end.
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int err;
+
+	write_lock_irq(&tree->lock);
+	err = tree_delete(&tree->map, em->end);
+	write_unlock_irq(&tree->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
+
+/*
+ * Try to merge 'state' with its immediate neighbours in the state tree.
+ * A neighbour is merged when it is byte-adjacent and has exactly the
+ * same state bits.  States with any EXTENT_IOBITS bit set are never
+ * merged because the end_io handlers need to be able to operate on them
+ * without sleeping (or doing allocations/splits).
+ *
+ * When merging with the previous neighbour, the neighbour is removed and
+ * 'state' grows backwards.  When merging with the next neighbour, it is
+ * 'state' itself that is removed from the tree and released, so callers
+ * must not touch 'state' after this returns.
+ *
+ * This should be called with the tree lock held.  Always returns 0.
+ */
+static int merge_state(struct extent_map_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *neighbor;
+	struct rb_node *node;
+
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	node = rb_prev(&state->rb_node);
+	if (node) {
+		neighbor = rb_entry(node, struct extent_state, rb_node);
+		if (neighbor->state == state->state &&
+		    neighbor->end == state->start - 1) {
+			state->start = neighbor->start;
+			neighbor->in_tree = 0;
+			rb_erase(&neighbor->rb_node, &tree->state);
+			free_extent_state(neighbor);
+		}
+	}
+
+	node = rb_next(&state->rb_node);
+	if (node) {
+		neighbor = rb_entry(node, struct extent_state, rb_node);
+		if (neighbor->state == state->state &&
+		    neighbor->start == state->end + 1) {
+			neighbor->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Fill in 'state' to describe [start, end] with 'bits' added to its
+ * state, insert it into the state tree and try to merge it with its
+ * neighbours.
+ *
+ * Returns 0 on success.  If an existing state already covers 'end',
+ * 'state' is released and -EEXIST is returned.
+ *
+ * The tree lock is not taken here; this is an internal helper for
+ * set/clear_extent_bit and probably not what outside callers want.
+ */
+static int insert_state(struct extent_map_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct rb_node *collision;
+
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+	state->start = start;
+	state->end = end;
+	state->state |= bits;
+
+	/* sanity check: complain about an end offset on a 4096 boundary */
+	if ((end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", start, end);
+		WARN_ON(1);
+	}
+
+	collision = tree_insert(&tree->state, end, &state->rb_node);
+	if (collision) {
+		struct extent_state *found;
+
+		found = rb_entry(collision, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		free_extent_state(state);
+		return -EEXIST;
+	}
+
+	merge_state(tree, state);
+	return 0;
+}
+
+/*
+ * Split 'orig' in two at offset 'split', which must lie inside it.  The
+ * preallocated struct 'prealloc' becomes the new first half and inherits
+ * orig's state bits.
+ *
+ * Before: the tree holds orig at [orig->start, orig->end].
+ * After:  prealloc: [orig->start, split - 1]
+ *         orig:     [split, orig->end]
+ *
+ * Returns 0 on success.  If inserting 'prealloc' collides with an
+ * existing state, 'prealloc' is released and -EEXIST is returned.
+ *
+ * The tree lock is not taken here; the caller has to hold it.
+ */
+static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *collision;
+
+	prealloc->state = orig->state;
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	orig->start = split;
+
+	/* sanity check: complain about an end offset on a 4096 boundary */
+	if ((prealloc->end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", prealloc->start,
+		       prealloc->end);
+		WARN_ON(1);
+	}
+
+	collision = tree_insert(&tree->state, prealloc->end,
+				&prealloc->rb_node);
+	if (!collision)
+		return 0;
+
+	{
+		struct extent_state *found;
+
+		found = rb_entry(collision, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+	}
+	free_extent_state(prealloc);
+	return -EEXIST;
+}
+
+/*
+ * Clear 'bits' on a single extent state.
+ *
+ * wake == 1 wakes up anybody sleeping on the state's wait queue.
+ * delete == 1 removes the state from the tree no matter which bits
+ * remain.  A state that has no bits left after clearing is removed from
+ * the tree and released as well; otherwise a merge with the neighbours
+ * is attempted.
+ *
+ * Returns the subset of 'bits' that was set before the call.
+ */
+static int clear_state_bit(struct extent_map_tree *tree,
+			   struct extent_state *state, int bits, int wake,
+			   int delete)
+{
+	int was_set = state->state & bits;
+
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+
+	if (!delete && state->state != 0) {
+		merge_state(tree, state);
+		return was_set;
+	}
+
+	if (!state->in_tree) {
+		WARN_ON(1);
+		return was_set;
+	}
+
+	rb_erase(&state->rb_node, &tree->state);
+	state->in_tree = 0;
+	free_extent_state(state);
+	return was_set;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ * -ENOMEM is returned when a split is required but no extent_state could
+ * be preallocated (allocation failure, or 'mask' does not allow waiting).
+ */
+int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	u64 last_end;
+	int err = 0;
+	int set = 0;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+	/* remembered now: 'state' may be freed by clear_state_bit() below */
+	last_end = state->end;
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+	if (state->start < start) {
+		if (!prealloc) {
+			/* a split needs a spare state and we have none */
+			err = -ENOMEM;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set |= clear_state_bit(tree, state, bits,
+					       wake, delete);
+			/* last_end + 1 would wrap around to zero */
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		/* sleepers are queued on the original state, not the new half */
+		if (wake)
+			wake_up(&state->wq);
+		set |= clear_state_bit(tree, prealloc, bits,
+				       wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	/* the state lies entirely inside the range */
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	if (last_end == (u64)-1)
+		goto out;
+	start = last_end + 1;
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err < 0 ? err : set;
+
+search_again:
+	/* the range is inclusive, so start == end still has one byte to do */
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+/*
+ * Sleep until somebody calls wake_up() on state->wq.
+ *
+ * Must be entered with tree->lock read-locked (irqs off); the lock is
+ * dropped while sleeping and re-taken before returning.  The task is
+ * queued on the wait queue before the lock is released so that a wake-up
+ * issued right after the unlock is not missed.  Always returns 0.
+ * NOTE(review): the caller is expected to hold a reference on 'state'
+ * across this call (wait_extent_bit() does) -- confirm for new callers.
+ */
+static int wait_on_state(struct extent_map_tree *tree,
+ struct extent_state *state)
+{
+ DEFINE_WAIT(wait);
+ /* queue ourselves first, then drop the lock and sleep */
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ read_unlock_irq(&tree->lock);
+ schedule();
+ read_lock_irq(&tree->lock);
+ finish_wait(&state->wq, &wait);
+ return 0;
+}
+
+/*
+ * Wait until none of 'bits' is set anywhere in the inclusive range
+ * [start, end] of the state tree.  The tree lock is taken by this
+ * function (and dropped while sleeping).  Always returns 0.
+ */
+int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+	for (;;) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			/*
+			 * pin the state while we sleep on it, then look at
+			 * the same offset again once we have been woken up
+			 */
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			continue;
+		}
+
+		start = state->end + 1;
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case ('failed_start' is only
+ * written when 'exclusive' is set).
+ *
+ * Returns 0 on success, -EEXIST as described above, or -ENOMEM when a
+ * new extent_state is required but none could be preallocated
+ * (allocation failure, or 'mask' does not allow waiting).
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	u64 last_start;
+	u64 last_end;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		if (!prealloc) {
+			/* an insert needs a spare state and we have none */
+			err = -ENOMEM;
+			goto out;
+		}
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	/* remembered now: merge_state() may free 'state' further down */
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		if ((state->state & bits) && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		state->state |= bits;
+		merge_state(tree, state);
+		/* last_end + 1 would wrap around to zero */
+		if (last_end == (u64)-1)
+			goto out;
+		start = last_end + 1;
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		if (exclusive && (state->state & bits)) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			state->state |= bits;
+			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 *
+	 * This has to be checked before the "state straddles the end of
+	 * the range" case below, otherwise a hole in front of such a
+	 * state would never get the bits set.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start - 1;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 * | ---------- state ---------- |
+	 * The state starts at 'start' but runs past 'end'.  We need to
+	 * split the extent, and set the bit on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		if (exclusive && (state->state & bits)) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		prealloc->state |= bits;
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+/* set EXTENT_DIRTY on the inclusive range [start, end], non-exclusively */
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+/* clear EXTENT_DIRTY on [start, end]; no wake-up, no forced delete */
+int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+/* set EXTENT_NEW on the inclusive range [start, end], non-exclusively */
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		   gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_NEW, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+/* clear EXTENT_NEW on [start, end]; no wake-up, no forced delete */
+int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+/* set EXTENT_UPTODATE on the inclusive range [start, end], non-exclusively */
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+/* clear EXTENT_UPTODATE on [start, end]; no wake-up, no forced delete */
+int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, wake,
+				delete, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+/* set EXTENT_WRITEBACK on the inclusive range [start, end], non-exclusively */
+int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+/*
+ * clear EXTENT_WRITEBACK on [start, end] and wake up anybody waiting on
+ * the affected states; no forced delete
+ */
+int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	const int wake = 1;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, wake,
+				delete, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+/* sleep until EXTENT_WRITEBACK is clear on all of [start, end] */
+int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	int ret = wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * Set EXTENT_LOCKED exclusively on the inclusive range [start, end],
+ * working in ascending order.  Whenever part of the range is already
+ * locked and 'mask' allows waiting, sleep until it is unlocked and
+ * resume from the offset that failed.
+ *
+ * Returns the last result of set_extent_bit(): 0 on success, or its
+ * error code (including -EEXIST when 'mask' does not allow waiting).
+ */
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	u64 failed_start;
+	int err;
+
+	for (;;) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		if (err != -EEXIST || !(mask & __GFP_WAIT))
+			break;
+
+		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+		start = failed_start;
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+/*
+ * clear EXTENT_LOCKED on [start, end] and wake up anybody waiting on the
+ * affected states; no forced delete
+ */
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	const int wake = 1;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * Mark every page cache page overlapping [start, end] dirty, then set
+ * EXTENT_DIRTY on the same range in the state tree.  Every page in the
+ * range must already be present in tree->mapping (BUG otherwise).
+ * Always returns 0.
+ */
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * Flag every page cache page overlapping [start, end] as under
+ * writeback, then set EXTENT_WRITEBACK on the same range in the state
+ * tree.  Every page in the range must already be present in
+ * tree->mapping (BUG otherwise).  Always returns 0.
+ */
+int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		page_cache_release(page);
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+/*
+ * Lock both the pages and the extent state for [start, end].  Pages are
+ * locked first (grab_cache_page() on each index), then the extent range
+ * is locked with lock_extent().
+ *
+ * Returns 0 on success.  If a page cannot be obtained, the pages locked
+ * so far are unlocked again and a negative error is returned.
+ *
+ * NOTE(review): the result of lock_extent() is ignored, and the page
+ * reference taken by grab_cache_page() is not dropped here or in
+ * unlock_range() -- confirm whether that is intentional.
+ */
+int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	const unsigned long first = start >> PAGE_CACHE_SHIFT;
+	const unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	unsigned long undo;
+	struct page *page;
+	int err;
+
+	for (idx = first; idx <= last; idx++) {
+		page = grab_cache_page(tree->mapping, idx);
+		if (!page) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed;
+		}
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+failed:
+	/*
+	 * getting the page at 'idx' failed, so undo everything up to but
+	 * not including that page
+	 */
+	for (undo = first; undo < idx; undo++) {
+		page = find_get_page(tree->mapping, undo);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * Unlock every page cache page overlapping [start, end], then clear
+ * EXTENT_LOCKED on the same range in the state tree.  Always returns 0.
+ * The pages are looked up with find_get_page() and used without a NULL
+ * check, so they are expected to be present in tree->mapping.
+ */
+int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+/*
+ * searches the inclusive range [start, end] in the state tree for 'bits'.
+ *
+ * If 'filled' == 1, this returns 1 only if every byte of the range is
+ * covered by an extent state that has at least one of the bits set; any
+ * hole (before, between or after the states found) or any state without
+ * the bits makes it return 0.
+ * Otherwise, 1 is returned if any state in the range has one of the bits
+ * set.
+ *
+ * The tree lock is taken by this function.
+ */
+static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+			  int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+
+	read_lock_irq(&tree->lock);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		/*
+		 * a gap in front of this state means the range is not
+		 * filled; this has to be checked before the "past the end"
+		 * test, since the gap may reach all the way to 'end'
+		 */
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+
+		/* state->end + 1 would wrap around to zero */
+		if (state->end == (u64)-1)
+			break;
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+		if (!node) {
+			/* nothing covers the rest of the range */
+			if (filled)
+				bitset = 0;
+			break;
+		}
+	}
+	read_unlock_irq(&tree->lock);
+	return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date.  Always returns 0.
+ */
+static int check_page_uptodate(struct extent_map_tree *tree,
+			       struct page *page)
+{
+	/*
+	 * widen before shifting: page->index is an unsigned long, so on
+	 * 32-bit the shift would otherwise overflow for large offsets
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked.  Always returns 0.
+ */
+static int check_page_locked(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	/*
+	 * widen before shifting: page->index is an unsigned long, so on
+	 * 32-bit the shift would otherwise overflow for large offsets
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback.  Always returns 0.
+ */
+static int check_page_writeback(struct extent_map_tree *tree,
+				struct page *page)
+{
+	/*
+	 * widen before shifting: page->index is an unsigned long, so on
+	 * 32-bit the shift would otherwise overflow for large offsets
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * Returns 1 without touching anything while bio->bi_size is still
+ * non-zero, and 0 once the bio has been processed and released.
+ */
+static int end_bio_extent_writepage(struct bio *bio,
+				    unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+
+		/*
+		 * widen before shifting: page->index is an unsigned long,
+		 * so on 32-bit the shift would otherwise overflow
+		 */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * Returns 1 without touching anything while bio->bi_size is still
+ * non-zero, and 0 once the bio has been processed and released.
+ */
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+
+		/*
+		 * widen before shifting: page->index is an unsigned long,
+		 * so on 32-bit the shift would otherwise overflow
+		 */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.  Pages are neither unlocked nor marked up to date
+ * here; on error the page is flagged with an error instead.
+ *
+ * Returns 1 without touching anything while bio->bi_size is still
+ * non-zero, and 0 once the bio has been processed and released.
+ */
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+
+		/*
+		 * widen before shifting: page->index is an unsigned long,
+		 * so on 32-bit the shift would otherwise overflow
+		 */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * Build a single-segment bio covering 'size' bytes at 'offset' inside
+ * 'page', aimed at 'sector' on 'bdev', and submit it with direction
+ * 'rw'.  'end_io_func' is installed as the completion handler and 'tree'
+ * is passed to it through bi_private.
+ *
+ * An extra bio reference is held across submit_bio() so the flags can be
+ * inspected afterwards.  Returns 0, or -EOPNOTSUPP when the bio came
+ * back flagged BIO_EOPNOTSUPP.
+ */
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      bio_end_io_t end_io_func)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio_vec *vec = &bio->bi_io_vec[0];
+	int ret;
+
+	/* the one and only segment */
+	vec->bv_page = page;
+	vec->bv_len = size;
+	vec->bv_offset = offset;
+
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = size;
+
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+
+	bio_get(bio);
+	submit_bio(rw, bio);
+
+	ret = bio_flagged(bio, BIO_EOPNOTSUPP) ? -EOPNOTSUPP : 0;
+
+	bio_put(bio);
+	return ret;
+}
+
+/*
+ * basic readpage implementation.  The byte range of the page is locked in
+ * the extent tree up front.  It is then walked piece by piece: a piece is
+ * zeroed (past i_size, or a hole), found to be uptodate already, or sent to
+ * disk with a read bio.  Pieces finished here are unlocked here; pieces sent
+ * to disk are unlocked when the IO is done (by the end_io handlers).
+ *
+ * get_extent maps a file offset to an extent_map.  The function always
+ * returns 0; failures are reported by setting PageError on the page.
+ */
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent)
+{
+	struct inode *inode = page->mapping->host;
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	/* tag the page as private and pin it with an extra reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		/* at or past i_size: zero the rest of the page and stop */
+		if (cur >= last_byte) {
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+
+		/* this piece stops at the extent end or the page end */
+		iosize = min(em->end - cur, end - cur) + 1;
+		cur_end = min(em->end, end);
+		/* the IO size is rounded up to a whole number of blocks */
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		/* everything needed was copied out, drop our reference */
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == 0) {
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_readpage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	/* no bio was submitted, so the page has to be finished right here */
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  The byte range
+ * of the page is locked in the extent tree, and each mapped piece has its
+ * dirty bit cleared, is marked writeback and is sent to disk with a write
+ * bio.  Then the lock bits are removed and the end_io handler clears the
+ * writeback ranges.
+ *
+ * get_extent maps a file offset to an extent_map.  The page is unlocked on
+ * every path and the function always returns 0; failures are reported by
+ * setting PageError on the page.
+ */
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+
+	/* the page lies entirely past i_size, there is nothing to write */
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* the page straddles i_size: zero the part behind it */
+	if (page->index == end_index) {
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+		zero_user_page(page, offset,
+			       PAGE_CACHE_SIZE - offset, KM_USER0);
+	}
+
+	/* tag the page as private and pin it with an extra reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		/* at or past i_size: drop the dirty bits and stop */
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+		/* this piece stops at the extent end or the page end */
+		iosize = min(em->end - cur, end - cur) + 1;
+		/* the IO size is rounded up to a whole number of blocks */
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		/* everything needed was copied out, drop our reference */
+		free_extent_map(em);
+		em = NULL;
+
+		/* holes and inline extents are not written from here */
+		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+					 EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		set_range_writeback(tree, cur, cur + iosize - 1);
+		ret = submit_extent_page(WRITE, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_writepage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	/* warn if any part of the page range is still marked dirty */
+	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then clears the locked and dirty
+ * bits for the range (the extent state records are meant to be deleted from
+ * the tree by that call).
+ *
+ * offset is the first byte inside the page that is invalidated; it is
+ * rounded up to the block size.  Always returns 0.
+ */
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+	/* blocks that are only partly covered are left alone */
+	start += (offset + blocksize -1) & ~(blocksize - 1);
+	if (start > end)
+		return 0;
+
+	lock_extent(tree, start, end, GFP_NOFS);
+	wait_on_extent_writeback(tree, start, end);
+	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
+			 1, 1, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(extent_invalidatepage);
+
+/*
+ * simple commit_write call: the page is tagged private (taking an extra
+ * reference the first time) and marked dirty, and i_size is pushed out when
+ * the write ends past the current end of the file.
+ *
+ * to is the offset inside the page where the write ends.  tree and from are
+ * not used here.  Always returns 0.
+ */
+int extent_commit_write(struct extent_map_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to)
+{
+	loff_t write_end;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+	set_page_dirty(page);
+
+	/* file offset just past the last byte written */
+	write_end = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	write_end += to;
+
+	/* the write stayed inside the file, i_size does not move */
+	if (write_end <= inode->i_size)
+		return 0;
+
+	i_size_write(inode, write_end);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(extent_commit_write);
+
+/*
+ * get a page ready for a write to the bytes [from, to) inside it.
+ *
+ * The blocks touched by the write are walked one extent at a time.  For a
+ * block that is only partly overwritten, the bytes outside [from, to) are
+ * zeroed when the extent is new, or read from disk when it is not new and
+ * not uptodate yet.  All reads that were started are waited for before
+ * returning.
+ *
+ * get_extent maps a file offset to an extent_map.  Returns 0 on success, or
+ * a negative errno when get_extent fails (its own error code, or -EIO when
+ * it hands back NULL).
+ */
+int extent_prepare_write(struct extent_map_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent)
+{
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 block_start;
+	u64 orig_block_start;
+	u64 block_end;
+	u64 cur_end;
+	struct extent_map *em;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	size_t page_offset = 0;
+	size_t block_off_start;
+	size_t block_off_end;
+	int err = 0;
+	int iocount = 0;
+	int ret = 0;
+	int isnew;
+
+	/* tag the page as private and pin it with an extra reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+	/* widen [from, to) to whole blocks */
+	block_start = (page_start + from) & ~((u64)blocksize - 1);
+	block_end = (page_start + to - 1) | (blocksize - 1);
+	orig_block_start = block_start;
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	while (block_start <= block_end) {
+		em = get_extent(inode, page, page_offset, block_start,
+				block_end, 1);
+		if (IS_ERR(em) || !em) {
+			/*
+			 * report the failure instead of returning success
+			 * with a page that was never prepared
+			 */
+			err = em ? PTR_ERR(em) : -EIO;
+			/*
+			 * the blocks not reached yet would stay locked for
+			 * good, so release them the same way the readpage
+			 * path does when get_extent fails
+			 */
+			unlock_extent(tree, block_start, block_end, GFP_NOFS);
+			goto err;
+		}
+		cur_end = min(block_end, em->end);
+		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+		block_off_end = block_off_start + blocksize;
+		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+		/* new extent, partly written: zero what the write skips */
+		if (!PageUptodate(page) && isnew &&
+		    (block_off_end > to || block_off_start < from)) {
+			void *kaddr;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_off_end > to)
+				memset(kaddr + to, 0, block_off_end - to);
+			if (block_off_start < from)
+				memset(kaddr + block_off_start, 0,
+				       from - block_off_start);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		/* old extent, partly written, not uptodate: read it in */
+		if (!isnew && !PageUptodate(page) &&
+		    (block_off_end > to || block_off_start < from) &&
+		    !test_range_bit(tree, block_start, cur_end,
+				    EXTENT_UPTODATE, 1)) {
+			u64 sector;
+			u64 extent_offset = block_start - em->start;
+			size_t iosize;
+			sector = (em->block_start + extent_offset) >> 9;
+			iosize = (cur_end - block_start + blocksize - 1) &
+				~((u64)blocksize - 1);
+			/*
+			 * we've already got the extent locked, but we
+			 * need to split the state such that our end_bio
+			 * handler can clear the lock.
+			 */
+			set_extent_bit(tree, block_start,
+				       block_start + iosize - 1,
+				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, em->bdev,
+					 end_bio_extent_preparewrite);
+			iocount++;
+			block_start = block_start + iosize;
+		} else {
+			set_extent_uptodate(tree, block_start, cur_end,
+					    GFP_NOFS);
+			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			block_start = cur_end + 1;
+		}
+		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+		free_extent_map(em);
+	}
+	/* wait until every read started above has dropped its lock bit */
+	if (iocount) {
+		wait_extent_bit(tree, orig_block_start,
+				block_end, EXTENT_LOCKED);
+	}
+	check_page_uptodate(tree, page);
+err:
+	/* FIXME, zero out newly allocated blocks on error */
+	return err;
+}
+EXPORT_SYMBOL(extent_prepare_write);
+
+/*
+ * a helper for releasepage.  The extent map records overlapping the page
+ * are removed from the tree one by one; the walk stops at the first record
+ * whose range is still locked.  After the walk the uptodate bits of the
+ * page range are cleared.
+ *
+ * Always returns 1.
+ */
+int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
+{
+	struct extent_map *em;
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 orig_start = start;
+
+	while (start <= end) {
+		em = lookup_extent_mapping(tree, start, end);
+		if (!em || IS_ERR(em))
+			break;
+		if (test_range_bit(tree, em->start, em->end,
+				   EXTENT_LOCKED, 0)) {
+			/*
+			 * em is only looked at while our reference is held;
+			 * once that reference is dropped nothing here keeps
+			 * the record alive
+			 */
+			printk("range still locked %Lu %Lu\n", em->start, em->end);
+			free_extent_map(em);
+			break;
+		}
+		remove_extent_mapping(tree, em);
+		start = em->end + 1;
+		/* once for the rb tree */
+		free_extent_map(em);
+		/* once for us */
+		free_extent_map(em);
+	}
+	WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0));
+	clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
+			 1, 1, GFP_NOFS);
+	return 1;
+}
+EXPORT_SYMBOL(try_release_extent_mapping);
+
}
}
-static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
u64 offset, ssize_t size,
- struct buffer_head *bh)
+ struct page *page, size_t page_offset)
{
struct btrfs_key key;
struct btrfs_path *path;
char *ptr, *kaddr;
- struct btrfs_trans_handle *trans;
struct btrfs_file_extent_item *ei;
u32 datasize;
int err = 0;
if (!path)
return -ENOMEM;
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
key.objectid = inode->i_ino;
BTRFS_FILE_EXTENT_INLINE);
ptr = btrfs_file_extent_inline_start(ei);
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ kaddr = kmap_atomic(page, KM_USER0);
btrfs_memcpy(root, path->nodes[0]->b_data,
- ptr, kaddr + bh_offset(bh),
- size);
+ ptr, kaddr + page_offset, size);
kunmap_atomic(kaddr, KM_USER0);
btrfs_mark_buffer_dirty(path->nodes[0]);
fail:
btrfs_free_path(path);
- ret = btrfs_end_transaction(trans, root);
- if (ret && !err)
- err = ret;
- mutex_unlock(&root->fs_info->fs_mutex);
return err;
}
loff_t pos,
size_t write_bytes)
{
- int i;
- int offset;
int err = 0;
- int ret;
- int this_write;
+ int i;
struct inode *inode = file->f_path.dentry->d_inode;
- struct buffer_head *bh;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key ins;
+ u64 hint_block;
+ u64 num_blocks;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(inode);
- for (i = 0; i < num_pages; i++) {
- offset = pos & (PAGE_CACHE_SIZE -1);
- this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
- /* FIXME, one block at a time */
- bh = page_buffers(pages[i]);
+ em->bdev = inode->i_sb->s_bdev;
- if (buffer_mapped(bh) && bh->b_blocknr == 0) {
- ret = insert_inline_extent(root, inode,
- pages[i]->index << PAGE_CACHE_SHIFT,
- offset + this_write, bh);
- if (ret) {
- err = ret;
- goto failed;
- }
- }
+ start_pos = pos & ~((u64)root->blocksize - 1);
+ num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+ inode->i_blkbits;
- ret = btrfs_commit_write(file, pages[i], offset,
- offset + this_write);
- pos += this_write;
- if (ret) {
- err = ret;
+ end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+ inode->i_blocks += num_blocks << 3;
+ hint_block = 0;
+
+ if ((end_of_last_block & 4095) == 0) {
+ printk("strange end of last %Lu %lu %Lu\n", start_pos, write_bytes, end_of_last_block);
+ }
+ set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+
+ /* FIXME...EIEIO, ENOSPC and more */
+
+ /* step one, delete the existing extents in this range */
+ /* FIXME blocksize != pagesize */
+ if (start_pos < inode->i_size) {
+ err = btrfs_drop_extents(trans, root, inode,
+ start_pos, (pos + write_bytes + root->blocksize -1) &
+ ~((u64)root->blocksize - 1), &hint_block);
+ if (err)
+ goto failed;
+ }
+
+ /* insert any holes we need to create */
+ if (inode->i_size < start_pos) {
+ u64 last_pos_in_file;
+ u64 hole_size;
+ u64 mask = root->blocksize - 1;
+ last_pos_in_file = (isize + mask) & ~mask;
+ hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+ hole_size >>= inode->i_blkbits;
+ if (last_pos_in_file < start_pos) {
+ err = btrfs_insert_file_extent(trans, root,
+ inode->i_ino,
+ last_pos_in_file,
+ 0, 0, hole_size);
+ }
+ if (err)
goto failed;
+ }
+
+ /*
+ * either allocate an extent for the new bytes or setup the key
+ * to show we are doing inline data in the extent
+ */
+ if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
+ pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+ err = btrfs_alloc_extent(trans, root, inode->i_ino,
+ num_blocks, 0, hint_block, (u64)-1,
+ &ins, 1);
+ BUG_ON(err);
+ err = btrfs_insert_file_extent(trans, root, inode->i_ino,
+ start_pos, ins.objectid, ins.offset,
+ ins.offset);
+ BUG_ON(err);
+ em->start = start_pos;
+ em->end = end_of_last_block;
+ em->block_start = ins.objectid << inode->i_blkbits;
+ em->block_end = em->block_start +
+ (ins.offset << inode->i_blkbits) - 1;
+ set_extent_dirty(em_tree, start_pos, end_of_last_block,
+ GFP_NOFS);
+ err = add_extent_mapping(em_tree, em);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ __set_page_dirty_nobuffers(p);
}
- WARN_ON(this_write > write_bytes);
- write_bytes -= this_write;
+ } else {
+ struct page *p = pages[0];
+ err = insert_inline_extent(trans, root, inode, start_pos,
+ end_pos - start_pos, p, 0);
+ BUG_ON(err);
+ em->start = start_pos;
+ em->end = end_pos;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_end = EXTENT_MAP_INLINE;
+ add_extent_mapping(em_tree, em);
+ }
+ if (end_pos > isize) {
+ i_size_write(inode, end_pos);
+ btrfs_update_inode(trans, root, inode);
}
failed:
+ err = btrfs_end_transaction(trans, root);
+out_unlock:
+ mutex_unlock(&root->fs_info->fs_mutex);
+ free_extent_map(em);
return err;
}
+/*
+ * throw away every cached extent mapping of the inode that a lookup for
+ * [start, end] finds.  The lookup is repeated until it comes back empty.
+ * Always returns 0.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_map_tree *cache = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *found;
+
+	while ((found = lookup_extent_mapping(cache, start, end)) != NULL) {
+		remove_extent_mapping(cache, found);
+		/* one reference belongs to us, from the lookup */
+		free_extent_map(found);
+		/* the other one belonged to the tree */
+		free_extent_map(found);
+	}
+	return 0;
+}
+
/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
int found_inline;
int recow;
+ btrfs_drop_extent_cache(inode, start, end - 1);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = file->f_path.dentry->d_inode;
- int offset;
int err = 0;
- int this_write;
- struct buffer_head *bh;
- struct buffer_head *head;
- loff_t isize = i_size_read(inode);
- struct btrfs_trans_handle *trans;
- u64 hint_block;
u64 num_blocks;
- u64 alloc_extent_start;
u64 start_pos;
- struct btrfs_key ins;
start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
pages[i] = grab_cache_page(inode->i_mapping, index + i);
if (!pages[i]) {
err = -ENOMEM;
- goto failed_release;
+ BUG_ON(1);
}
cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
wait_on_page_writeback(pages[i]);
- }
-
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- mutex_unlock(&root->fs_info->fs_mutex);
- goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
- /* FIXME blocksize != 4096 */
- inode->i_blocks += num_blocks << 3;
- hint_block = 0;
-
- /* FIXME...EIEIO, ENOSPC and more */
-
- /* step one, delete the existing extents in this range */
- /* FIXME blocksize != pagesize */
- if (start_pos < inode->i_size) {
- err = btrfs_drop_extents(trans, root, inode,
- start_pos, (pos + write_bytes + root->blocksize -1) &
- ~((u64)root->blocksize - 1), &hint_block);
- if (err)
- goto failed_release;
- }
-
- /* insert any holes we need to create */
- if (inode->i_size < start_pos) {
- u64 last_pos_in_file;
- u64 hole_size;
- u64 mask = root->blocksize - 1;
- last_pos_in_file = (isize + mask) & ~mask;
- hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
- hole_size >>= inode->i_blkbits;
- if (last_pos_in_file < start_pos) {
- err = btrfs_insert_file_extent(trans, root,
- inode->i_ino,
- last_pos_in_file,
- 0, 0, hole_size);
- }
- if (err)
- goto failed_release;
- }
-
- /*
- * either allocate an extent for the new bytes or setup the key
- * to show we are doing inline data in the extent
- */
- if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
- pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
- err = btrfs_alloc_extent(trans, root, inode->i_ino,
- num_blocks, 0, hint_block, (u64)-1,
- &ins, 1);
- if (err)
- goto failed_truncate;
- err = btrfs_insert_file_extent(trans, root, inode->i_ino,
- start_pos, ins.objectid, ins.offset,
- ins.offset);
- if (err)
- goto failed_truncate;
- } else {
- ins.offset = 0;
- ins.objectid = 0;
- }
- BUG_ON(err);
- alloc_extent_start = ins.objectid;
- err = btrfs_end_transaction(trans, root);
- mutex_unlock(&root->fs_info->fs_mutex);
-
- for (i = 0; i < num_pages; i++) {
- offset = pos & (PAGE_CACHE_SIZE -1);
- this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
- if (!page_has_buffers(pages[i])) {
- create_empty_buffers(pages[i],
- root->fs_info->sb->s_blocksize,
- (1 << BH_Uptodate));
+ if (!PagePrivate(pages[i])) {
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], 1);
+ page_cache_get(pages[i]);
}
- head = page_buffers(pages[i]);
- bh = head;
- do {
- err = btrfs_map_bh_to_logical(root, bh,
- alloc_extent_start);
- BUG_ON(err);
- if (err)
- goto failed_truncate;
- bh = bh->b_this_page;
- if (alloc_extent_start)
- alloc_extent_start++;
- } while (bh != head);
- pos += this_write;
- WARN_ON(this_write > write_bytes);
- write_bytes -= this_write;
}
return 0;
-
-failed_release:
- btrfs_drop_pages(pages, num_pages);
- return err;
-
-failed_truncate:
- btrfs_drop_pages(pages, num_pages);
- if (pos > isize)
- vmtruncate(inode, isize);
- return err;
-
-out_unlock:
- mutex_unlock(&root->fs_info->fs_mutex);
- goto failed_release;
-
}
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
page_cache_release(pinned[1]);
*ppos = pos;
current->backing_dev_info = NULL;
- mark_inode_dirty(inode);
return num_written ? num_written : err;
}
mutex_unlock(&root->fs_info->trans_mutex);
/*
- * ok we haven't committed the transaction yet, lets do a commit
- */
+ * ok we haven't committed the transaction yet, lets do a commit
+ */
trans = btrfs_start_transaction(root, 1);
if (!trans) {
ret = -ENOMEM;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+ inode->i_mapping, GFP_NOFS);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
BTRFS_I(inode)->block_group->key.objectid);
}
-static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode)
{
int found_extent;
int del_item;
+ btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
path = btrfs_alloc_path();
path->reada = -1;
BUG_ON(!path);
return ret;
}
+/*
+ * give one page of a file a freshly allocated block: the extents that cover
+ * the page are dropped, one block is allocated and a file extent pointing
+ * at it is inserted.  After that the page is flagged Checked and the bytes
+ * from zero_start up to the end of the page are zeroed.
+ *
+ * Pass PAGE_CACHE_SIZE as zero_start when nothing is to be zeroed.  Returns
+ * 0 on success, or the error code of the step that failed.
+ */
+static int btrfs_cow_one_page(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct page *page,
+			      size_t zero_start)
+{
+	char *kaddr;
+	int ret = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint = 0;
+	/*
+	 * widen the index before shifting: shifting at the native width of
+	 * page->index wraps on 32 bit once the file offset reaches 4GB
+	 */
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	struct btrfs_key ins;
+
+	/* tag the page as private and pin it with an extra reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = btrfs_drop_extents(trans, root, inode,
+				 page_start, page_start + PAGE_CACHE_SIZE,
+				 &alloc_hint);
+	if (ret)
+		goto out;
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
+				 alloc_hint, (u64)-1, &ins, 1);
+	if (ret)
+		goto out;
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       page_start, ins.objectid, 1, 1);
+	if (ret)
+		goto out;
+	SetPageChecked(page);
+	/* the page is only mapped when there is a tail to zero */
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+
+out:
+	return ret;
+}
+
/*
* taken from block_truncate_page, but does cow as it zeros out
* any bytes left in the last page in the file.
static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
{
struct inode *inode = mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
unsigned blocksize = 1 << inode->i_blkbits;
pgoff_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
struct page *page;
- char *kaddr;
int ret = 0;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct btrfs_trans_handle *trans;
+ u64 page_start;
if ((offset & (blocksize - 1)) == 0)
goto out;
page = grab_cache_page(mapping, index);
if (!page)
goto out;
-
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
lock_page(page);
goto out;
}
}
+ page_start = page->index << PAGE_CACHE_SHIFT;
+
mutex_lock(&root->fs_info->fs_mutex);
trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
-
- ret = btrfs_drop_extents(trans, root, inode,
- page->index << PAGE_CACHE_SHIFT,
- (page->index + 1) << PAGE_CACHE_SHIFT,
- &alloc_hint);
- if (ret)
- goto out;
- ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
- alloc_hint, (u64)-1, &ins, 1);
- if (ret)
- goto out;
- ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
- page->index << PAGE_CACHE_SHIFT,
- ins.objectid, 1, 1);
- if (ret)
- goto out;
- SetPageChecked(page);
- kaddr = kmap(page);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- ret = btrfs_csum_file_block(trans, root, inode->i_ino,
- page->index << PAGE_CACHE_SHIFT,
- kaddr, PAGE_CACHE_SIZE);
- kunmap(page);
+ ret = btrfs_cow_one_page(trans, inode, page, offset);
+ if (!ret) {
+ char *kaddr = kmap(page);
+ ret = btrfs_csum_file_block(trans, root, inode->i_ino,
+ page_start, kaddr, PAGE_CACHE_SIZE);
+ kunmap(page);
+ }
+ set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+ page_start, page_start + PAGE_CACHE_SIZE - 1,
+ GFP_NOFS);
+ set_page_dirty(page);
btrfs_end_transaction(trans, root);
mutex_unlock(&root->fs_info->fs_mutex);
- set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
out:
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+ inode->i_mapping, GFP_NOFS);
}
dir->i_sb->s_dirt = 1;
btrfs_update_inode_block_group(trans, inode);
return err;
}
+/*
+ * find the extent that covers file offset start and hand it back as an
+ * extent_map.
+ *
+ * The extent map cache of the inode is tried first.  On a miss the file
+ * extent item is looked up in the btree, turned into a new extent_map and
+ * added to the cache.  A range with no extent behind it gets a map whose
+ * block_start is 0, an inline extent one whose block_start is
+ * EXTENT_MAP_INLINE.  When page is not NULL, the data of an inline extent is
+ * copied into it at page_offset.
+ *
+ * create is not looked at in this function.
+ *
+ * Returns the extent_map with a reference for the caller, or an ERR_PTR on
+ * failure.
+ */
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	u32 found_type;
+	int failed_insert = 0;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+again:
+	/* a cached mapping is handed out as it is */
+	em = lookup_extent_mapping(em_tree, start, end);
+	if (em)
+		goto out;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		err = -ENOMEM;
+		goto out;
+	}
+	em->start = 0;
+	em->end = 0;
+	em->bdev = inode->i_sb->s_bdev;
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       objectid, start, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	/* no exact match: the item in front of the slot is the candidate */
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		goto not_found;
+	}
+
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_end = extent_start +
+		       (btrfs_file_extent_num_blocks(item) << inode->i_blkbits);
+		err = 0;
+		/*
+		 * extent_end is the first byte behind the extent, so an
+		 * offset equal to it is already outside
+		 */
+		if (start < extent_start || start >= extent_end) {
+			em->start = start;
+			/*
+			 * NOTE(review): for start < extent_start the map is
+			 * made to end at extent_end - 1, which reaches across
+			 * the extent that was found - confirm this is meant
+			 */
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		/* a disk block number of 0 is mapped as a hole */
+		if (btrfs_file_extent_disk_blocknr(item) == 0) {
+			em->start = extent_start;
+			em->end = extent_end - 1;
+			em->block_start = 0;
+			em->block_end = 0;
+			goto insert;
+		}
+		em->block_start = blocknr << inode->i_blkbits;
+		em->block_end = em->block_start +
+			(btrfs_file_extent_num_blocks(item) <<
+			 inode->i_blkbits) - 1;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		goto insert;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = extent_start + size;
+		/* same exclusive end as for regular extents */
+		if (start < extent_start || start >= extent_end) {
+			em->start = start;
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		if (!page) {
+			goto insert;
+		}
+		/* copy the inline data from the leaf into the caller's page */
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(page);
+		memcpy(map + page_offset, ptr, size);
+		/* flush the page that was just written to */
+		flush_dcache_page(page);
+		kunmap(page);
+		set_extent_uptodate(em_tree, extent_start,
+				    extent_end, GFP_NOFS);
+		goto insert;
+	} else {
+		printk("unkknown found_type %d\n", found_type);
+		WARN_ON(1);
+	}
+not_found:
+	em->start = start;
+	em->end = end;
+not_found_em:
+	em->block_start = 0;
+	em->block_end = 0;
+insert:
+	btrfs_release_path(root, path);
+	if (em->start > start || em->end < start) {
+		printk("bad extent! %Lu %Lu start %Lu end %Lu\n", em->start, em->end, start, end);
+		WARN_ON(1);
+		err = -EIO;
+		goto out;
+	}
+	ret = add_extent_mapping(em_tree, em);
+	if (ret == -EEXIST) {
+		/*
+		 * our copy is gone after this; forget the pointer at once so
+		 * the error exit below cannot free it a second time
+		 */
+		free_extent_map(em);
+		em = NULL;
+		failed_insert++;
+		if (failed_insert > 5) {
+			printk("failing to insert %Lu %Lu\n", start, end);
+			err = -EIO;
+			goto out;
+		}
+		goto again;
+	}
+	err = 0;
+out:
+	btrfs_free_path(path);
+	if (trans) {
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (err) {
+		/* em is NULL after a failed allocation or a failed insert */
+		if (em)
+			free_extent_map(em);
+		WARN_ON(1);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
+
/*
* FIBMAP and others want to pass in a fake buffer head. They need to
* use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
return err;
}
-static int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
- struct buffer_head *result, int create)
-{
- int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page *page = result->b_page;
- u64 offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(result);
- struct btrfs_csum_item *item;
- struct btrfs_path *path = NULL;
-
- mutex_lock(&root->fs_info->fs_mutex);
- ret = btrfs_get_block_lock(inode, iblock, result, create);
- if (ret)
- goto out;
-
- path = btrfs_alloc_path();
- item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0);
- if (IS_ERR(item)) {
- ret = PTR_ERR(item);
- /* a csum that isn't present is a preallocated region. */
- if (ret == -ENOENT || ret == -EFBIG)
- ret = 0;
- result->b_private = NULL;
- goto out;
- }
- memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
-out:
- if (path)
- btrfs_free_path(path);
- mutex_unlock(&root->fs_info->fs_mutex);
- return ret;
-}
-
static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *result, int create)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- mutex_lock(&root->fs_info->fs_mutex);
- btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
- mutex_unlock(&root->fs_info->fs_mutex);
+	/* widen the block number first so the shift cannot wrap */
+	u64 start = (u64)iblock << inode->i_blkbits;
+	u64 end = start + root->blocksize -1;
+	struct extent_map *em;
+
+	/*
+	 * map one block through the extent map code.  Holes, inline
+	 * extents and lookup failures leave the buffer head untouched.
+	 */
+	em = btrfs_get_extent(inode, NULL, 0, start, end, 0);
+	if (!em || IS_ERR(em))
+		return 0;
+	if (em->block_start != EXTENT_MAP_INLINE &&
+	    em->block_start != 0) {
+		u64 offset;
+		offset = start - em->start;
+		start = (em->block_start + offset) >> inode->i_blkbits;
+		btrfs_map_bh_to_logical(root, result, start);
+	}
+	/* drop the reference btrfs_get_extent handed to us */
+	free_extent_map(em);
return 0;
}
+/*
+ * ->prepare_write hook: hand the page and its from/to range to
+ * extent_prepare_write() for the extent tree of the inode that owns the
+ * page, with btrfs_get_extent as the extent lookup callback.
+ * @file is not used. Returns whatever extent_prepare_write() returns.
+ */
static int btrfs_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- return block_prepare_write(page, from, to, btrfs_get_block);
+ struct inode *inode = page->mapping->host;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+
+ return extent_prepare_write(em_tree, inode, page, from, to,
+ btrfs_get_extent);
}
-static void buffer_io_error(struct buffer_head *bh)
+/*
+ * Read one full page by passing it to extent_read_full_page() together
+ * with the extent tree of the inode that owns the page and
+ * btrfs_get_extent as the extent lookup callback.
+ * @file is not used. Returns whatever extent_read_full_page() returns.
+ */
+int btrfs_readpage(struct file *file, struct page *page)
{
- char b[BDEVNAME_SIZE];
-
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ struct inode *inode = page->mapping->host;
+
+ return extent_read_full_page(&BTRFS_I(inode)->extent_tree, page,
+ btrfs_get_extent);
}
-
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
-static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+/*
+ * Write one full page by passing it and @wbc to extent_write_full_page()
+ * together with the extent tree of the inode that owns the page and
+ * btrfs_get_extent as the extent lookup callback.
+ * Returns whatever extent_write_full_page() returns.
+ */
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
- unsigned long flags;
- struct buffer_head *first;
- struct buffer_head *tmp;
- struct page *page;
- int page_uptodate = 1;
- struct inode *inode;
- int ret;
-
- BUG_ON(!buffer_async_read(bh));
-
- page = bh->b_page;
- inode = page->mapping->host;
- if (uptodate) {
- void *kaddr;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- if (bh->b_private) {
- char csum[BTRFS_CRC32_SIZE];
- kaddr = kmap_atomic(page, KM_IRQ0);
- ret = btrfs_csum_data(root, kaddr + bh_offset(bh),
- bh->b_size, csum);
- BUG_ON(ret);
- if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) {
- u64 offset;
- offset = (page->index << PAGE_CACHE_SHIFT) +
- bh_offset(bh);
- printk("btrfs csum failed ino %lu off %llu\n",
- page->mapping->host->i_ino,
- (unsigned long long)offset);
- memset(kaddr + bh_offset(bh), 1, bh->b_size);
- flush_dcache_page(page);
- }
- kunmap_atomic(kaddr, KM_IRQ0);
- }
- set_buffer_uptodate(bh);
- } else {
- clear_buffer_uptodate(bh);
- if (printk_ratelimit())
- buffer_io_error(bh);
- SetPageError(page);
- }
-
- /*
- * Be _very_ careful from here on. Bad things can happen if
- * two buffer heads end IO at almost the same time and both
- * decide that the page is now completely done.
- */
- first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
- clear_buffer_async_read(bh);
- unlock_buffer(bh);
- tmp = bh;
- do {
- if (!buffer_uptodate(tmp))
- page_uptodate = 0;
- if (buffer_async_read(tmp)) {
- BUG_ON(!buffer_locked(tmp));
- goto still_busy;
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
-
- /*
- * If none of the buffers had errors and they are all
- * uptodate then we can set the page uptodate.
- */
- if (page_uptodate && !PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- return;
-
-still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
- return;
+ struct inode *inode = page->mapping->host;
+
+ return extent_write_full_page(&BTRFS_I(inode)->extent_tree, page,
+ btrfs_get_extent, wbc);
}
-/*
- * Generic "read page" function for block devices that have the normal
- * get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
- * set/clear_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
- */
-int btrfs_readpage(struct file *file, struct page *page)
+/*
+ * ->releasepage hook.
+ *
+ * A page whose private value is not 1 is unexpected here: warn and fall
+ * back to try_to_free_buffers(). Otherwise ask the extent tree of the
+ * owning inode to release its mapping for the page via
+ * try_release_extent_mapping(); when that returns 1 the page's private
+ * flag and value are cleared and one page reference is dropped.
+ *
+ * The gfp argument is ignored. Returns the try_to_free_buffers() result
+ * on the fallback path, else the try_release_extent_mapping() result.
+ */
+static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
- struct inode *inode = page->mapping->host;
- sector_t iblock, lblock;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize;
- int nr, i;
- int fully_mapped = 1;
-
- BUG_ON(!PageLocked(page));
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
-
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
- bh = head;
- nr = 0;
- i = 0;
-
- do {
- if (buffer_uptodate(bh))
- continue;
-
- if (!buffer_mapped(bh)) {
- int err = 0;
-
- fully_mapped = 0;
- if (iblock < lblock) {
- WARN_ON(bh->b_size != blocksize);
- err = btrfs_get_block_csum(inode, iblock,
- bh, 0);
- if (err)
- SetPageError(page);
- }
- if (!buffer_mapped(bh)) {
- void *kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + i * blocksize, 0, blocksize);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- if (!err)
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * get_block() might have updated the buffer
- * synchronously
- */
- if (buffer_uptodate(bh))
- continue;
- }
- arr[nr++] = bh;
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
- if (fully_mapped)
- SetPageMappedToDisk(page);
-
- if (!nr) {
- /*
- * All buffers are uptodate - we can set the page uptodate
- * as well. But not if get_block() returned an error.
- */
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
- }
-
- /* Stage two: lock the buffers */
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- lock_buffer(bh);
- bh->b_end_io = btrfs_end_buffer_async_read;
- set_buffer_async_read(bh);
- }
-
- /*
- * Stage 3: start the IO. Check for uptodateness
- * inside the buffer lock in case another process reading
- * the underlying blockdev brought it uptodate (the sct fix).
- */
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (buffer_uptodate(bh))
- btrfs_end_buffer_async_read(bh, 1);
- else
- submit_bh(READ, bh);
- }
- return 0;
-}
-
-/*
- * Aside from a tiny bit of packed file data handling, this is the
- * same as the generic code.
- *
- * While block_write_full_page is writing back the dirty buffers under
- * the page lock, whoever dirtied the buffers may decide to clean them
- * again at any time. We handle that by only looking at the buffer
- * state inside lock_buffer().
- *
- * If block_write_full_page() is called for regular writeback
- * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
- * locked buffer. This only can happen if someone has written the buffer
- * directly, with submit_bh(). At the address_space level PageWriteback
- * prevents this contention from occurring.
- */
-static int __btrfs_write_full_page(struct inode *inode, struct page *page,
- struct writeback_control *wbc)
-{
- int err;
- sector_t block;
- sector_t last_block;
- struct buffer_head *bh, *head;
- const unsigned blocksize = 1 << inode->i_blkbits;
- int nr_underway = 0;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- BUG_ON(!PageLocked(page));
-
- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
- /* no csumming allowed when from PF_MEMALLOC */
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
+ struct extent_map_tree *em_tree;
+ int released;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ /* anything but 1 in the private field is unexpected: warn, fall back */
+ if (page_private(page) != 1) {
+ WARN_ON(1);
+ return try_to_free_buffers(page);
}
-
- /*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
- * here, and the (potentially unmapped) buffers may become dirty at
- * any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the page stays dirty.
- *
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
- * handle that here by just cleaning them.
- */
-
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- head = page_buffers(page);
- bh = head;
-
- /*
- * Get all the dirty buffers mapped to disk addresses and
- * handle any aliases from the underlying blockdev's mapping.
- */
- do {
- if (block > last_block) {
- /*
- * mapped buffers outside i_size will occur, because
- * this page can be outside i_size when there is a
- * truncate in progress.
- */
- /*
- * The buffer was zeroed by block_write_full_page()
- */
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
- WARN_ON(bh->b_size != blocksize);
- err = btrfs_get_block(inode, block, bh, 0);
- if (err) {
- goto recover;
- }
- if (buffer_new(bh)) {
- /* blockdev mappings never come here */
- clear_buffer_new(bh);
- }
- }
- bh = bh->b_this_page;
- block++;
- } while (bh != head);
-
- do {
- if (!buffer_mapped(bh))
- continue;
- /*
- * If it's a fully non-blocking write attempt and we cannot
- * lock the buffer then redirty the page. Note that this can
- * potentially cause a busy-wait loop from pdflush and kswapd
- * activity, but those code paths have their own higher-level
- * throttling.
- */
- if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
- lock_buffer(bh);
- } else if (test_set_buffer_locked(bh)) {
- redirty_page_for_writepage(wbc, page);
- continue;
- }
- if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
- struct btrfs_trans_handle *trans;
- int ret;
- u64 off = page->index << PAGE_CACHE_SHIFT;
- char *kaddr;
-
- off += bh_offset(bh);
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
- kaddr = kmap(page);
- btrfs_csum_file_block(trans, root, inode->i_ino,
- off, kaddr + bh_offset(bh),
- bh->b_size);
- kunmap(page);
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->fs_mutex);
- mark_buffer_async_write(bh);
- } else {
- unlock_buffer(bh);
- }
- } while ((bh = bh->b_this_page) != head);
-
- /*
- * The page and its buffers are protected by PageWriteback(), so we can
- * drop the bh refcounts early.
- */
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
-
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- submit_bh(WRITE, bh);
- nr_underway++;
- }
- bh = next;
- } while (bh != head);
- unlock_page(page);
-
- err = 0;
-done:
- if (nr_underway == 0) {
- /*
- * The page was marked dirty, but the buffers were
- * clean. Someone wrote them back by hand with
- * ll_rw_block/submit_bh. A rare case.
- */
- int uptodate = 1;
- do {
- if (!buffer_uptodate(bh)) {
- uptodate = 0;
- break;
- }
- bh = bh->b_this_page;
- } while (bh != head);
- if (uptodate)
- SetPageUptodate(page);
- end_page_writeback(page);
+ em_tree = &BTRFS_I(page->mapping->host)->extent_tree;
+ released = try_release_extent_mapping(em_tree, page);
+ /* mapping released: detach the private state, drop one page reference */
+ if (released == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
}
- return err;
-
-recover:
- /*
- * ENOSPC, or some other error. We may already have added some
- * blocks to the file, so we need to write these out to avoid
- * exposing stale data.
- * The page is currently locked and not marked for writeback
- */
- bh = head;
- /* Recovery: lock and submit the mapped buffers */
- do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
- lock_buffer(bh);
- mark_buffer_async_write(bh);
- } else {
- /*
- * The buffer may have been set dirty during
- * attachment to a dirty page.
- */
- clear_buffer_dirty(bh);
- }
- } while ((bh = bh->b_this_page) != head);
- SetPageError(page);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- clear_buffer_dirty(bh);
- submit_bh(WRITE, bh);
- nr_underway++;
- }
- bh = next;
- } while (bh != head);
- unlock_page(page);
- goto done;
+ return released;
}
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * ->invalidatepage hook: pass the page and @offset to
+ * extent_invalidatepage() for the extent tree of the owning inode, then
+ * call btrfs_releasepage() on the page. The btrfs_releasepage() result
+ * is not checked.
+ */
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
- struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset;
- void *kaddr;
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- return __btrfs_write_full_page(inode, page, wbc);
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
- if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- block_invalidatepage(page, 0);
- unlock_page(page);
- return 0; /* don't care */
- }
+ struct inode *inode = page->mapping->host;
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invokation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- return __btrfs_write_full_page(inode, page, wbc);
+
+ extent_invalidatepage(&BTRFS_I(inode)->extent_tree, page, offset);
+ btrfs_releasepage(page, GFP_NOFS);
}
/*
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
unsigned long end;
loff_t size;
int ret = -EINVAL;
+ u64 page_start;
lock_page(page);
wait_on_page_writeback(page);
size = i_size_read(inode);
+ page_start = page->index << PAGE_CACHE_SHIFT;
+
if ((page->mapping != inode->i_mapping) ||
- ((page->index << PAGE_CACHE_SHIFT) > size)) {
+ (page_start > size)) {
/* page got truncated out from underneath us */
goto out_unlock;
}
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+ if (page_start + PAGE_CACHE_SIZE > size)
end = size & ~PAGE_CACHE_MASK;
else
end = PAGE_CACHE_SIZE;
- ret = btrfs_prepare_write(NULL, page, 0, end);
- if (!ret)
- ret = btrfs_commit_write(NULL, page, 0, end);
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_cow_one_page(trans, inode, page, end);
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+ set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+ page_start, page_start + PAGE_CACHE_SIZE - 1,
+ GFP_NOFS);
+ set_page_dirty(page);
out_unlock:
unlock_page(page);
+/*
+ * ->commit_write hook: hand the page and its from/to range to
+ * extent_commit_write() for the extent tree of the inode that owns the
+ * page. @file is not used. Returns whatever extent_commit_write()
+ * returns.
+ */
int btrfs_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- struct inode *inode = page->mapping->host;
- struct buffer_head *bh;
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
- SetPageUptodate(page);
- bh = page_buffers(page);
- set_buffer_uptodate(bh);
- if (buffer_mapped(bh) && bh->b_blocknr != 0) {
- set_page_dirty(page);
- }
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- return 0;
+ struct inode *inode = page->mapping->host;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+
+ return extent_commit_write(em_tree, inode, page, from, to);
}
static int create_subvol(struct btrfs_root *root, char *name, int namelen)
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+ inode->i_mapping, GFP_NOFS);
}
dir->i_sb->s_dirt = 1;
btrfs_update_inode_block_group(trans, inode);
.prepare_write = btrfs_prepare_write,
.commit_write = btrfs_commit_write,
.bmap = btrfs_bmap,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+ .set_page_dirty = __set_page_dirty_nobuffers,
};
static struct address_space_operations btrfs_symlink_aops = {