3 * Copyright (C) 2007 Oracle. All rights reserved.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
21 #include <sys/types.h>
26 #include "kerncompat.h"
27 #include "extent_io.h"
/*
 * Set up an empty extent_io_tree: initialise the state tree, the buffer
 * cache tree and the LRU list head, then default the cache limit to one
 * quarter of the value reported by total_memory().
 */
34 void extent_io_tree_init(struct extent_io_tree *tree)
36 cache_tree_init(&tree->state);
37 cache_tree_init(&tree->cache);
38 INIT_LIST_HEAD(&tree->lru);
40 tree->max_cache_size = (u64)total_memory() / 4;
/*
 * Variant of extent_io_tree_init() for callers that want their own cache
 * limit: the tree is initialised normally and the default max_cache_size is
 * then overwritten with the supplied value.
 */
43 void extent_io_tree_init_cache_max(struct extent_io_tree *tree,
46 extent_io_tree_init(tree);
47 tree->max_cache_size = max_cache_size;
/*
 * Allocate one extent_state with malloc(). malloc() does not zero the
 * memory, so fields have to be assigned explicitly; here the embedded
 * cache node objectid is reset to 0.
 */
50 static struct extent_state *alloc_extent_state(void)
52 struct extent_state *state;
54 state = malloc(sizeof(*state));
57 state->cache_node.objectid = 0;
/*
 * Release path for an extent_state. A negative reference count is treated
 * as a fatal inconsistency (BUG_ON).
 */
64 static void btrfs_free_extent_state(struct extent_state *state)
67 BUG_ON(state->refs < 0);
/*
 * Callback adapter with the cache_extent based signature expected by
 * cache_tree_free_extents(): recover the extent_state that embeds the given
 * cache node and pass it to btrfs_free_extent_state().
 */
72 static void free_extent_state_func(struct cache_extent *cache)
74 struct extent_state *es;
76 es = container_of(cache, struct extent_state, cache_node);
77 btrfs_free_extent_state(es);
/* Forward declaration; the definition appears further down in this file. */
80 static void free_extent_buffer_final(struct extent_buffer *eb);
/*
 * Tear down an extent_io_tree.
 *
 * The LRU list is drained from its head, one extent_buffer per iteration.
 * The loop body contains an "extent buffer leak" diagnostic that prints the
 * buffer start and length, followed by calls that release the buffer.
 * Once the list is empty every extent_state left in tree->state is freed
 * through free_extent_state_func().
 */
81 void extent_io_tree_cleanup(struct extent_io_tree *tree)
83 struct extent_buffer *eb;
85 while(!list_empty(&tree->lru)) {
86 eb = list_entry(tree->lru.next, struct extent_buffer, lru);
89 "extent buffer leak: start %llu len %u\n",
90 (unsigned long long)eb->start, eb->len);
91 free_extent_buffer_nocache(eb);
93 free_extent_buffer_final(eb);
97 cache_tree_free_extents(&tree->state, free_extent_state_func);
/*
 * Re-derive the embedded cache node key from the state range. The end
 * offset is inclusive, hence size = end + 1 - start.
 */
100 static inline void update_extent_state(struct extent_state *state)
102 state->cache_node.start = state->start;
103 state->cache_node.size = state->end + 1 - state->start;
107 * Utility function to look for merge candidates inside a given range.
108 * Any extents with matching state are merged together into a single
109 * extent in the tree. Extents with EXTENT_IO in their state field are not merged.
/*
 * Try to coalesce state with its direct neighbours in tree->state.
 * EXTENT_IOBITS is tested first, before any neighbour is looked at.
 *
 * NOTE: in the forward-merge case state itself is removed from the tree and
 * freed, so a caller must not dereference state after calling this.
 */
112 static int merge_state(struct extent_io_tree *tree,
113 struct extent_state *state)
115 struct extent_state *other;
116 struct cache_extent *other_node;
118 if (state->state & EXTENT_IOBITS)
/*
 * Backward merge: if the previous extent ends at state->start - 1 and has
 * exactly the same bits, state is extended down to cover it and the
 * previous extent is removed from the tree and freed.
 */
121 other_node = prev_cache_extent(&state->cache_node);
123 other = container_of(other_node, struct extent_state,
125 if (other->end == state->start - 1 &&
126 other->state == state->state) {
127 state->start = other->start;
128 update_extent_state(state);
129 remove_cache_extent(&tree->state, &other->cache_node);
130 btrfs_free_extent_state(other);
/*
 * Forward merge: if the next extent begins at state->end + 1 with exactly
 * the same bits, the next extent is extended down to state->start and
 * state is the entry that gets removed and freed.
 */
133 other_node = next_cache_extent(&state->cache_node);
135 other = container_of(other_node, struct extent_state,
137 if (other->start == state->end + 1 &&
138 other->state == state->state) {
139 other->start = state->start;
140 update_extent_state(other);
141 remove_cache_extent(&tree->state, &state->cache_node);
142 btrfs_free_extent_state(state);
149 * insert an extent_state struct into the tree. 'bits' are set on the
150 * struct before it is inserted.
/*
 * Prepare state and link it into tree->state: the requested bits are ORed
 * into state->state, the start offset is recorded, the cache node key is
 * refreshed and the node is inserted. ret holds the result of
 * insert_cache_extent(). merge_state() is then given a chance to fold the
 * new entry into a neighbour, so state may no longer exist afterwards.
 */
152 static int insert_state(struct extent_io_tree *tree,
153 struct extent_state *state, u64 start, u64 end,
159 state->state |= bits;
160 state->start = start;
162 update_extent_state(state);
163 ret = insert_cache_extent(&tree->state, &state->cache_node);
165 merge_state(tree, state);
170 * split a given extent state struct in two, inserting the preallocated
171 * struct 'prealloc' as the newly created first half. 'split' indicates an
172 * offset inside 'orig' where it should be split.
/*
 * Split orig at offset split using the caller-provided prealloc.
 *
 * prealloc receives the front portion [orig->start, split - 1] together
 * with a copy of the state bits of orig, and is inserted into tree->state.
 * The cache node of orig is refreshed as well. ret holds the result of
 * insert_cache_extent() for prealloc.
 */
174 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
175 struct extent_state *prealloc, u64 split)
178 prealloc->start = orig->start;
179 prealloc->end = split - 1;
180 prealloc->state = orig->state;
181 update_extent_state(prealloc);
183 update_extent_state(orig);
184 ret = insert_cache_extent(&tree->state, &prealloc->cache_node);
190 * clear some bits on a range in the tree.
/*
 * Clear bits on a single extent_state.
 *
 * ret is initialised to the subset of bits that were set before clearing.
 * When no bits remain the state is unlinked from tree->state and freed;
 * merge_state() handles a state that still carries bits (the branch
 * keyword separating the two is not shown here). Either way the caller
 * must treat state as gone afterwards, since merge_state() may free it too.
 */
192 static int clear_state_bit(struct extent_io_tree *tree,
193 struct extent_state *state, int bits)
195 int ret = state->state & bits;
197 state->state &= ~bits;
198 if (state->state == 0) {
199 remove_cache_extent(&tree->state, &state->cache_node);
200 btrfs_free_extent_state(state);
202 merge_state(tree, state);
208 * clear some bits on a range in the tree.
/*
 * Clear bits on every extent_state overlapping start..end (end is used as
 * an inclusive offset: states beginning after end are out of range and
 * splits happen at end + 1).
 *
 * Cases handled for the state returned by search_cache_extent():
 *  - state begins before start: it is split at start; if the remainder
 *    ends inside the range its bits are cleared and start advances.
 *  - state begins inside the range but runs past end: it is split at
 *    end + 1 and the bits are cleared on prealloc, which split_state()
 *    turns into the front (in-range) part.
 *  - state lies entirely inside the range: bits are cleared on it and
 *    start advances to last_end + 1.
 *
 * set accumulates the values returned by clear_state_bit(), i.e. which of
 * the requested bits were actually present.
 * last_end == (u64)-1 is tested before each last_end + 1; presumably this
 * stops the walk instead of letting the addition wrap to 0 - confirm.
 * A -EEXIST result from split_state() is treated as a fatal bug.
 */
210 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
212 struct extent_state *state;
213 struct extent_state *prealloc = NULL;
214 struct cache_extent *node;
/* Reserve the extra extent_state that a split may need. */
221 prealloc = alloc_extent_state();
227 * this search will find the extents that end after
230 node = search_cache_extent(&tree->state, start);
233 state = container_of(node, struct extent_state, cache_node);
234 if (state->start > end)
236 last_end = state->end;
239 * | ---- desired range ---- |
241 * | ------------- state -------------- |
243 * We need to split the extent we found, and may flip
244 * bits on second half.
246 * If the extent we found extends past our range, we
247 * just split and search again. It'll get split again
248 * the next time though.
250 * If the extent we found is inside our range, we clear
251 * the desired bit on it.
253 if (state->start < start) {
254 err = split_state(tree, state, prealloc, start);
255 BUG_ON(err == -EEXIST);
259 if (state->end <= end) {
260 set |= clear_state_bit(tree, state, bits);
261 if (last_end == (u64)-1)
263 start = last_end + 1;
265 start = state->start;
270 * | ---- desired range ---- |
272 * We need to split the extent, and clear the bit
275 if (state->start <= end && state->end > end) {
276 err = split_state(tree, state, prealloc, end + 1);
277 BUG_ON(err == -EEXIST);
279 set |= clear_state_bit(tree, prealloc, bits);
284 start = state->end + 1;
285 set |= clear_state_bit(tree, state, bits);
286 if (last_end == (u64)-1)
288 start = last_end + 1;
292 btrfs_free_extent_state(prealloc);
302 * set some bits on a range in the tree.
/*
 * Set bits on start..end (end is used as an inclusive offset), creating,
 * splitting and merging extent_state entries as required.
 *
 * Cases handled for the state returned by search_cache_extent():
 *  - a brand new state covering start..end is inserted with insert_state()
 *    (the condition guarding that call is not shown here).
 *  - state begins exactly at start and ends inside the range: bits are
 *    ORed in, neighbours are merged, start advances to last_end + 1.
 *  - state begins before start: split at start; when the remainder ends
 *    inside the range the bits are set on it and it is merged.
 *  - state begins after start: the hole in front of it is filled with a
 *    new state ending no later than last_start - 1.
 *  - state runs past end: split at end + 1.
 *
 * last_start and last_end are sampled before any merge because
 * merge_state() may free state.
 * A -EEXIST result from split_state() or insert_state() is treated as a
 * fatal bug.
 *
 * NOTE(review): in the final case the bits are ORed into state after
 * split_state(tree, state, prealloc, end + 1). split_state() assigns
 * prealloc the front part [orig->start, split - 1], which is the part
 * inside the requested range, and clear_extent_bits() operates on prealloc
 * in the equivalent situation. Setting the bits on state therefore looks
 * like it marks the part beyond end instead - confirm and fix if so.
 */
304 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
306 struct extent_state *state;
307 struct extent_state *prealloc = NULL;
308 struct cache_extent *node;
/* Reserve the extra extent_state that an insert or split may need. */
314 prealloc = alloc_extent_state();
320 * this search will find the extents that end after
323 node = search_cache_extent(&tree->state, start);
325 err = insert_state(tree, prealloc, start, end, bits);
326 BUG_ON(err == -EEXIST);
331 state = container_of(node, struct extent_state, cache_node);
332 last_start = state->start;
333 last_end = state->end;
336 * | ---- desired range ---- |
339 * Just lock what we found and keep going
341 if (state->start == start && state->end <= end) {
342 state->state |= bits;
343 merge_state(tree, state);
344 if (last_end == (u64)-1)
346 start = last_end + 1;
350 * | ---- desired range ---- |
353 * | ------------- state -------------- |
355 * We need to split the extent we found, and may flip bits on
358 * If the extent we found extends past our
359 * range, we just split and search again. It'll get split
360 * again the next time though.
362 * If the extent we found is inside our range, we set the
365 if (state->start < start) {
366 err = split_state(tree, state, prealloc, start);
367 BUG_ON(err == -EEXIST);
371 if (state->end <= end) {
372 state->state |= bits;
373 start = state->end + 1;
374 merge_state(tree, state);
375 if (last_end == (u64)-1)
377 start = last_end + 1;
379 start = state->start;
384 * | ---- desired range ---- |
385 * | state | or | state |
387 * There's a hole, we need to insert something in it and
388 * ignore the extent we found.
390 if (state->start > start) {
392 if (end < last_start)
395 this_end = last_start -1;
396 err = insert_state(tree, prealloc, start, this_end,
398 BUG_ON(err == -EEXIST);
402 start = this_end + 1;
406 * | ---- desired range ---- |
407 * | ---------- state ---------- |
408 * We need to split the extent, and set the bit
411 err = split_state(tree, state, prealloc, end + 1);
412 BUG_ON(err == -EEXIST);
414 state->state |= bits;
415 merge_state(tree, prealloc);
419 btrfs_free_extent_state(prealloc);
/*
 * Convenience wrapper: set EXTENT_DIRTY on start..end and return whatever
 * set_extent_bits() returns.
 */
427 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
429 return set_extent_bits(tree, start, end, EXTENT_DIRTY);
/*
 * Convenience wrapper: clear EXTENT_DIRTY on start..end and return whatever
 * clear_extent_bits() returns.
 */
432 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
434 return clear_extent_bits(tree, start, end, EXTENT_DIRTY);
/*
 * Locate the first extent_state at or after start that has any of bits set.
 *
 * The walk begins at the node returned by search_cache_extent() and moves
 * forward with next_cache_extent(). For the first state whose end is at or
 * beyond start and whose bit mask intersects bits, the state range is
 * written to *start_ret and *end_ret.
 */
437 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
438 u64 *start_ret, u64 *end_ret, int bits)
440 struct cache_extent *node;
441 struct extent_state *state;
445 * this search will find all the extents that end after
448 node = search_cache_extent(&tree->state, start);
453 state = container_of(node, struct extent_state, cache_node);
454 if (state->end >= start && (state->state & bits)) {
455 *start_ret = state->start;
456 *end_ret = state->end;
460 node = next_cache_extent(node);
/*
 * Test bits across start..end.
 *
 * States are visited in order starting from search_cache_extent(start),
 * for as long as a node exists and start has not moved past end. After each
 * state start is advanced to state->end + 1.
 *
 * filled changes how a gap is treated: when it is non-zero and the current
 * state begins after start (a hole in coverage) a dedicated branch is
 * taken. A state beginning after end and a state that has any of bits set
 * each have their own branch as well. The resulting return values are not
 * visible here.
 */
468 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
469 int bits, int filled)
471 struct extent_state *state = NULL;
472 struct cache_extent *node;
475 node = search_cache_extent(&tree->state, start);
476 while (node && start <= end) {
477 state = container_of(node, struct extent_state, cache_node);
479 if (filled && state->start > start) {
483 if (state->start > end)
485 if (state->state & bits) {
493 start = state->end + 1;
496 node = next_cache_extent(node);
/*
 * Store a private value on the extent_state found for start.
 *
 * The state is looked up with search_cache_extent(); a state whose start
 * differs from the requested start takes a separate branch (its outcome is
 * not shown here). On the matching path private is written to xprivate.
 */
506 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
508 struct cache_extent *node;
509 struct extent_state *state;
512 node = search_cache_extent(&tree->state, start);
517 state = container_of(node, struct extent_state, cache_node);
518 if (state->start != start) {
522 state->xprivate = private;
/*
 * Fetch the private value of the extent_state found for start.
 *
 * Mirror image of set_state_private(): same lookup, same check that the
 * state begins exactly at start, and on the matching path xprivate is
 * copied out through the private pointer.
 */
527 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
529 struct cache_extent *node;
530 struct extent_state *state;
533 node = search_cache_extent(&tree->state, start);
538 state = container_of(node, struct extent_state, cache_node);
539 if (state->start != start) {
543 *private = state->xprivate;
/*
 * Allocate an extent_buffer with room for blocksize payload bytes directly
 * behind the header, in a single zero-filled calloc() allocation.
 *
 * dev_bytenr starts out as (u64)-1, the cache node is keyed by
 * (bytenr, blocksize) and the recow list head is initialised. Nothing here
 * inserts the buffer into a cache tree or LRU list.
 */
548 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
549 u64 bytenr, u32 blocksize)
551 struct extent_buffer *eb;
553 eb = calloc(1, sizeof(struct extent_buffer) + blocksize);
563 eb->dev_bytenr = (u64)-1;
564 eb->cache_node.start = bytenr;
565 eb->cache_node.size = blocksize;
566 INIT_LIST_HEAD(&eb->recow);
/*
 * Create a private copy of src.
 *
 * The copy is allocated with a NULL tree argument, receives all src->len
 * bytes of payload and is flagged EXTENT_BUFFER_DUMMY, the flag the free
 * paths in this file use to skip cache-tree bookkeeping.
 */
571 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
573 struct extent_buffer *new;
575 new = __alloc_extent_buffer(NULL, src->start, src->len);
579 copy_extent_buffer(new, src, 0, 0, src->len);
580 new->flags |= EXTENT_BUFFER_DUMMY;
/*
 * Last stage of releasing an extent_buffer: unlink it from the LRU list
 * and, unless it is a dummy buffer, remove it from the cache tree and
 * subtract its length from the cache size accounting.
 *
 * NOTE(review): tree->cache_size is read by the BUG_ON before the
 * EXTENT_BUFFER_DUMMY test, while btrfs_clone_extent_buffer() allocates
 * dummy buffers with a NULL tree argument. If eb->tree can be NULL for such
 * buffers this dereferences NULL - confirm.
 */
585 static void free_extent_buffer_final(struct extent_buffer *eb)
587 struct extent_io_tree *tree = eb->tree;
590 BUG_ON(tree->cache_size < eb->len);
591 list_del_init(&eb->lru);
592 if (!(eb->flags & EXTENT_BUFFER_DUMMY)) {
593 remove_cache_extent(&tree->cache, &eb->cache_node);
594 tree->cache_size -= eb->len;
/*
 * Shared implementation behind free_extent_buffer() (free_now == 0) and
 * free_extent_buffer_nocache() (free_now == 1).
 *
 * NULL and error-pointer arguments are filtered first. A negative reference
 * count, or a buffer still flagged EXTENT_DIRTY at this point, is a fatal
 * bug. The buffer is taken off its recow list, and only dummy buffers or
 * calls with free_now set proceed to free_extent_buffer_final(); other
 * buffers are not finalised by this statement.
 */
599 static void free_extent_buffer_internal(struct extent_buffer *eb, bool free_now)
601 if (!eb || IS_ERR(eb))
605 BUG_ON(eb->refs < 0);
607 BUG_ON(eb->flags & EXTENT_DIRTY);
608 list_del_init(&eb->recow);
609 if (eb->flags & EXTENT_BUFFER_DUMMY || free_now)
610 free_extent_buffer_final(eb);
/*
 * Release eb through free_extent_buffer_internal() with free_now == 0,
 * so immediate finalisation is only requested for dummy buffers.
 */
614 void free_extent_buffer(struct extent_buffer *eb)
616 free_extent_buffer_internal(eb, 0);
/*
 * Release eb through free_extent_buffer_internal() with free_now == 1,
 * requesting finalisation regardless of the dummy flag.
 */
619 void free_extent_buffer_nocache(struct extent_buffer *eb)
621 free_extent_buffer_internal(eb, 1);
/*
 * Look up a cached extent_buffer for exactly (bytenr, blocksize).
 *
 * A cache entry only counts as a hit when both its start and its size match
 * the request. On a hit the buffer is moved to the tail of the LRU list.
 * eb starts out as NULL, which is what remains when nothing matches.
 */
624 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
625 u64 bytenr, u32 blocksize)
627 struct extent_buffer *eb = NULL;
628 struct cache_extent *cache;
630 cache = lookup_cache_extent(&tree->cache, bytenr, blocksize);
631 if (cache && cache->start == bytenr &&
632 cache->size == blocksize) {
633 eb = container_of(cache, struct extent_buffer, cache_node);
634 list_move_tail(&eb->lru, &tree->lru);
/*
 * Find the first cached extent_buffer reported by search_cache_extent() for
 * the given start offset and move it to the tail of the LRU list.
 * eb starts out as NULL.
 */
640 struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree,
643 struct extent_buffer *eb = NULL;
644 struct cache_extent *cache;
646 cache = search_cache_extent(&tree->cache, start);
648 eb = container_of(cache, struct extent_buffer, cache_node);
649 list_move_tail(&eb->lru, &tree->lru);
/*
 * Shrink the buffer cache by finalising buffers in LRU list order, starting
 * from the head of the list.
 *
 * The _safe list iterator is required because free_extent_buffer_final()
 * unlinks the current entry. The size check uses 90% of max_cache_size as
 * the low-water mark rather than the limit itself.
 */
655 static void trim_extent_buffer_cache(struct extent_io_tree *tree)
657 struct extent_buffer *eb, *tmp;
659 list_for_each_entry_safe(eb, tmp, &tree->lru, lru) {
661 free_extent_buffer_final(eb);
662 if (tree->cache_size <= ((tree->max_cache_size * 9) / 10))
/*
 * Get the extent_buffer for (bytenr, blocksize), creating it if needed.
 *
 * Exact cache hit (same start and size): the cached buffer is reused and
 * moved to the LRU tail.
 * Otherwise: a cache entry that was found but does not match is released
 * with free_extent_buffer(), a fresh buffer is allocated, inserted into the
 * cache tree, appended to the LRU list and added to cache_size. Reaching
 * max_cache_size triggers trim_extent_buffer_cache().
 */
667 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
668 u64 bytenr, u32 blocksize)
670 struct extent_buffer *eb;
671 struct cache_extent *cache;
673 cache = lookup_cache_extent(&tree->cache, bytenr, blocksize);
674 if (cache && cache->start == bytenr &&
675 cache->size == blocksize) {
676 eb = container_of(cache, struct extent_buffer, cache_node);
677 list_move_tail(&eb->lru, &tree->lru);
683 eb = container_of(cache, struct extent_buffer,
685 free_extent_buffer(eb);
687 eb = __alloc_extent_buffer(tree, bytenr, blocksize);
690 ret = insert_cache_extent(&tree->cache, &eb->cache_node);
695 list_add_tail(&eb->lru, &tree->lru);
696 tree->cache_size += blocksize;
697 if (tree->cache_size >= tree->max_cache_size)
698 trim_extent_buffer_cache(tree);
/*
 * Read len bytes from eb->fd into the buffer payload at the given offset.
 *
 * NOTE(review): offset is applied to the destination (eb->data + offset)
 * but the file position passed to pread() is eb->dev_bytenr without offset.
 * Confirm that callers rely on this, e.g. by always passing offset 0.
 */
703 int read_extent_from_disk(struct extent_buffer *eb,
704 unsigned long offset, unsigned long len)
707 ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr);
/*
 * Write the whole payload of eb (eb->len bytes) to eb->fd at position
 * eb->dev_bytenr with a single pwrite(). A byte count different from
 * eb->len, i.e. a short write, is handled in its own branch.
 */
721 int write_extent_to_disk(struct extent_buffer *eb)
724 ret = pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr);
727 if (ret != eb->len) {
/*
 * Read bytes of data, addressed by the logical offset, into buf.
 *
 * Each piece is translated with btrfs_map_block(READ), which may shrink
 * read_len; the length is additionally clamped to what is still
 * outstanding. Data is read with pread() from the device of stripe 0 at
 * that stripe physical address. Mapping failures, read errors and short
 * reads are reported on stderr. bytes_left and total_read track progress.
 *
 * NOTE(review): a descriptor value of 0 is rejected by the fd <= 0 test
 * although 0 is a valid descriptor number - presumably 0 means "not
 * opened" in this code base; confirm.
 * NOTE(review): buf + total_read is arithmetic on a void pointer, which is
 * a compiler extension rather than standard C.
 */
736 int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
737 u64 bytes, int mirror)
739 struct btrfs_multi_bio *multi = NULL;
740 struct btrfs_device *device;
741 u64 bytes_left = bytes;
747 read_len = bytes_left;
748 ret = btrfs_map_block(info, READ, offset, &read_len, &multi,
751 fprintf(stderr, "Couldn't map the block %Lu\n",
755 device = multi->stripes[0].dev;
757 read_len = min(bytes_left, read_len);
758 if (device->fd <= 0) {
763 ret = pread(device->fd, buf + total_read, read_len,
764 multi->stripes[0].physical);
767 fprintf(stderr, "Error reading %Lu, %d\n", offset,
771 if (ret != read_len) {
772 fprintf(stderr, "Short read for %Lu, read %d, "
773 "read_len %Lu\n", offset, ret, read_len);
777 bytes_left -= read_len;
779 total_read += read_len;
/*
 * Write bytes of data from buf to the logical offset.
 *
 * The outer loop runs until bytes_left reaches 0; every round maps the
 * current position with btrfs_map_block(WRITE), which may shrink this_len.
 *
 * Two write paths exist:
 *  - parity path: a temporary extent_buffer is built with malloc() and
 *    memset(), filled with at most info->nodesize bytes of payload and
 *    handed to write_raid56_with_parity() together with the stripe length
 *    that the mapping reported and raid_map.
 *  - plain path: the same bytes are written with pwrite() to every stripe
 *    returned by the mapping, at each stripe physical address.
 *
 * Write errors (with errno) and short writes are reported on stderr.
 * The BUG_ON guards the unsigned subtraction bytes_left -= this_len
 * against underflow.
 *
 * NOTE(review): a descriptor value of 0 is rejected by the fd <= 0 test
 * although 0 is a valid descriptor number - confirm this is intended.
 * NOTE(review): buf + total_write is arithmetic on a void pointer, which is
 * a compiler extension rather than standard C.
 */
785 int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
786 u64 bytes, int mirror)
788 struct btrfs_multi_bio *multi = NULL;
789 struct btrfs_device *device;
790 u64 bytes_left = bytes;
793 u64 *raid_map = NULL;
798 while (bytes_left > 0) {
799 this_len = bytes_left;
802 ret = btrfs_map_block(info, WRITE, offset, &this_len, &multi,
805 fprintf(stderr, "Couldn't map the block %Lu\n",
811 struct extent_buffer *eb;
812 u64 stripe_len = this_len;
814 this_len = min(this_len, bytes_left);
815 this_len = min(this_len, (u64)info->nodesize);
817 eb = malloc(sizeof(struct extent_buffer) + this_len);
819 fprintf(stderr, "cannot allocate memory for eb\n");
824 memset(eb, 0, sizeof(struct extent_buffer) + this_len);
828 memcpy(eb->data, buf + total_write, this_len);
829 ret = write_raid56_with_parity(info, eb, multi,
830 stripe_len, raid_map);
836 } else while (dev_nr < multi->num_stripes) {
837 device = multi->stripes[dev_nr].dev;
838 if (device->fd <= 0) {
843 dev_bytenr = multi->stripes[dev_nr].physical;
844 this_len = min(this_len, bytes_left);
847 ret = pwrite(device->fd, buf + total_write, this_len, dev_bytenr);
848 if (ret != this_len) {
850 fprintf(stderr, "Error writing to "
851 "device %d\n", errno);
856 fprintf(stderr, "Short write\n");
863 BUG_ON(bytes_left < this_len);
865 bytes_left -= this_len;
867 total_write += this_len;
/*
 * Mark eb dirty. Work is only done on the clean-to-dirty transition: the
 * EXTENT_DIRTY flag is set, the inclusive byte range
 * [eb->start, eb->start + eb->len - 1] is marked dirty in the owning tree,
 * and a reference is taken with extent_buffer_get(). That reference is
 * presumably the one clear_extent_buffer_dirty() drops again.
 */
879 int set_extent_buffer_dirty(struct extent_buffer *eb)
881 struct extent_io_tree *tree = eb->tree;
882 if (!(eb->flags & EXTENT_DIRTY)) {
883 eb->flags |= EXTENT_DIRTY;
884 set_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
885 extent_buffer_get(eb);
/*
 * Mark eb clean. Work is only done when the buffer is currently dirty: the
 * EXTENT_DIRTY flag is cleared first (free_extent_buffer_internal() treats
 * a dirty flag as a bug), the byte range is cleared in the owning tree, and
 * free_extent_buffer() is called on the buffer.
 */
890 int clear_extent_buffer_dirty(struct extent_buffer *eb)
892 struct extent_io_tree *tree = eb->tree;
893 if (eb->flags & EXTENT_DIRTY) {
894 eb->flags &= ~EXTENT_DIRTY;
895 clear_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
896 free_extent_buffer(eb);
/*
 * Compare len bytes of the buffer payload, starting at byte offset start,
 * with the memory at ptrv. The result follows memcmp() conventions.
 * No bounds check against the buffer length is visible here.
 */
901 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
902 unsigned long start, unsigned long len)
904 return memcmp(eb->data + start, ptrv, len);
/*
 * Copy len bytes out of the buffer payload, starting at byte offset start,
 * into dst. memcpy() is used, so dst must not overlap the payload.
 */
907 void read_extent_buffer(struct extent_buffer *eb, void *dst,
908 unsigned long start, unsigned long len)
910 memcpy(dst, eb->data + start, len);
/*
 * Copy len bytes from src into the buffer payload at byte offset start.
 * memcpy() is used, so src must not overlap the payload.
 */
913 void write_extent_buffer(struct extent_buffer *eb, const void *src,
914 unsigned long start, unsigned long len)
916 memcpy(eb->data + start, src, len);
/*
 * Copy len bytes from src payload at src_offset into dst payload at
 * dst_offset. memcpy() is used, so this is only valid for non-overlapping
 * regions; memmove_extent_buffer() is the overlap-safe variant for moves
 * within one buffer.
 */
919 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
920 unsigned long dst_offset, unsigned long src_offset,
923 memcpy(dst->data + dst_offset, src->data + src_offset, len);
/*
 * Move len bytes inside one buffer payload from src_offset to dst_offset.
 * memmove() is used, so source and destination ranges may overlap.
 */
926 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
927 unsigned long src_offset, unsigned long len)
929 memmove(dst->data + dst_offset, dst->data + src_offset, len);
/*
 * Fill len bytes of the buffer payload, starting at byte offset start,
 * with the byte value c.
 */
932 void memset_extent_buffer(struct extent_buffer *eb, char c,
933 unsigned long start, unsigned long len)
935 memset(eb->data + start, c, len);
/*
 * Test bit number nr in the bitmap that begins at byte offset start of the
 * buffer payload. The lookup is delegated to le_test_bit() on the payload
 * viewed as an array of u8.
 */
938 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
941 return le_test_bit(nr, (u8 *)eb->data + start);