3 * Copyright (C) 2007 Oracle. All rights reserved.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
21 #include <sys/types.h>
26 #include "kerncompat.h"
27 #include "extent_io.h"
/*
 * Prepare 'tree' for use: initialise its 'state' and 'cache' cache trees and
 * its LRU list head, and default max_cache_size to a quarter of the value
 * returned by total_memory().
 *
 * NOTE(review): this listing skips a line just before the max_cache_size
 * assignment; check the full file for further field initialisation.
 */
34 void extent_io_tree_init(struct extent_io_tree *tree)
36 cache_tree_init(&tree->state);
37 cache_tree_init(&tree->cache);
38 INIT_LIST_HEAD(&tree->lru);
40 tree->max_cache_size = (u64)total_memory() / 4;
/*
 * Initialise 'tree' through extent_io_tree_init(), then replace the default
 * max_cache_size with the caller-supplied limit.
 *
 * NOTE(review): the continuation of the parameter list, which must declare
 * max_cache_size, is not present in this listing.
 */
43 void extent_io_tree_init_cache_max(struct extent_io_tree *tree,
46 extent_io_tree_init(tree);
47 tree->max_cache_size = max_cache_size;
/*
 * Allocate a new extent_state with malloc(). malloc() does not zero the
 * memory, so fields have to be set explicitly; the visible code resets
 * cache_node.objectid to 0.
 *
 * NOTE(review): lines between the malloc() and the objectid assignment, and
 * after it, are missing from this listing - presumably the NULL check, the
 * remaining field initialisation and the return; confirm in the full file.
 */
50 static struct extent_state *alloc_extent_state(void)
52 struct extent_state *state;
54 state = malloc(sizeof(*state));
57 state->cache_node.objectid = 0;
/*
 * Release an extent_state. The BUG_ON traps a 'refs' count that has gone
 * negative.
 *
 * NOTE(review): the statements around the BUG_ON are not visible here -
 * presumably 'refs' is decremented first and the state is freed once it
 * reaches zero; confirm in the full file.
 */
64 static void btrfs_free_extent_state(struct extent_state *state)
67 BUG_ON(state->refs < 0);
72 static void free_extent_state_func(struct cache_extent *cache)
74 struct extent_state *es;
76 es = container_of(cache, struct extent_state, cache_node);
77 btrfs_free_extent_state(es);
/* Defined further down; declared here because the cleanup below calls it. */
80 static void free_extent_buffer_final(struct extent_buffer *eb);
/*
 * Tear down an extent_io_tree: drain every extent_buffer still linked on the
 * LRU list, then release all extent_state records held in tree->state via
 * free_extent_state_func().
 *
 * NOTE(review): the listing omits the condition that chooses between the two
 * release calls inside the loop. Presumably buffers that are still referenced
 * are reported as leaks and released with free_extent_buffer_nocache(), while
 * the rest go straight to free_extent_buffer_final() - confirm.
 */
81 void extent_io_tree_cleanup(struct extent_io_tree *tree)
83 struct extent_buffer *eb;
/* Always take the current head of the LRU list until the list is empty. */
85 while(!list_empty(&tree->lru)) {
86 eb = list_entry(tree->lru.next, struct extent_buffer, lru);
89 "extent buffer leak: start %llu len %u\n",
90 (unsigned long long)eb->start, eb->len);
91 free_extent_buffer_nocache(eb);
93 free_extent_buffer_final(eb);
97 cache_tree_free_extents(&tree->state, free_extent_state_func);
100 static inline void update_extent_state(struct extent_state *state)
102 state->cache_node.start = state->start;
103 state->cache_node.size = state->end + 1 - state->start;
107 * Utility function to look for merge candidates inside a given range.
108 * Any extents with matching state are merged together into a single
109 * extent in the tree. Extents with EXTENT_IO in their state field are
/*
 * Try to coalesce 'state' with its immediate neighbours in tree->state.
 *
 * A neighbour qualifies when it is byte-adjacent to 'state' and carries
 * exactly the same state bits. With a matching predecessor, 'state' grows
 * backwards to cover it and the predecessor is removed and released. With a
 * matching successor, the successor grows backwards to cover 'state' and
 * 'state' itself is removed and released, so callers must not rely on
 * 'state' after this call.
 *
 * NOTE(review): the NULL checks on the neighbour lookups, the action taken
 * when EXTENT_IOBITS is set (presumably an early return) and the return
 * value are not visible in this listing.
 */
112 static int merge_state(struct extent_io_tree *tree,
113 struct extent_state *state)
115 struct extent_state *other;
116 struct cache_extent *other_node;
118 if (state->state & EXTENT_IOBITS)
/* Look left: absorb a matching predecessor into 'state'. */
121 other_node = prev_cache_extent(&state->cache_node);
123 other = container_of(other_node, struct extent_state,
125 if (other->end == state->start - 1 &&
126 other->state == state->state) {
127 state->start = other->start;
128 update_extent_state(state);
129 remove_cache_extent(&tree->state, &other->cache_node);
130 btrfs_free_extent_state(other);
/* Look right: fold 'state' into a matching successor. */
133 other_node = next_cache_extent(&state->cache_node);
135 other = container_of(other_node, struct extent_state,
137 if (other->start == state->end + 1 &&
138 other->state == state->state) {
139 other->start = state->start;
140 update_extent_state(other);
141 remove_cache_extent(&tree->state, &state->cache_node);
142 btrfs_free_extent_state(state);
149 * insert an extent_state struct into the tree. 'bits' are set on the
150 * struct before it is inserted.
/*
 * Fill in 'state' and add it to tree->state: OR 'bits' into its state word,
 * set its start, sync the embedded cache_node, insert it into the tree and
 * finally try to merge it with adjacent extents.
 *
 * NOTE(review): the tail of the parameter list (declaring 'bits'), the local
 * declarations, the assignment of state->end (which update_extent_state()
 * reads) and the handling of 'ret' are not part of this listing.
 */
152 static int insert_state(struct extent_io_tree *tree,
153 struct extent_state *state, u64 start, u64 end,
159 state->state |= bits;
160 state->start = start;
162 update_extent_state(state);
163 ret = insert_cache_extent(&tree->state, &state->cache_node);
165 merge_state(tree, state);
170 * split a given extent state struct in two, inserting the preallocated
171 * struct 'prealloc' as the newly created first half. 'split' indicates an
172 * offset inside 'orig' where it should be split.
/*
 * Split 'orig' at offset 'split'. 'prealloc' is set up as the piece
 * [orig->start, split - 1], inherits orig's state bits and is inserted into
 * tree->state.
 *
 * NOTE(review): the line between the two update_extent_state() calls is not
 * visible; presumably it advances orig->start to 'split' so that 'orig'
 * keeps [split, orig->end]. The handling of 'ret' is missing as well.
 */
174 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
175 struct extent_state *prealloc, u64 split)
178 prealloc->start = orig->start;
179 prealloc->end = split - 1;
180 prealloc->state = orig->state;
181 update_extent_state(prealloc);
183 update_extent_state(orig);
184 ret = insert_cache_extent(&tree->state, &prealloc->cache_node);
190 * clear some bits on a single extent state in the tree.
/*
 * Clear 'bits' on one extent_state. 'ret' captures which of the requested
 * bits were set before clearing. If no bits remain afterwards the state is
 * removed from tree->state and released; merge_state() is the other path.
 *
 * Either path may release 'state', so callers must not rely on it afterwards.
 *
 * NOTE(review): the 'else' joining the two paths and the final return are
 * not visible here; presumably 'ret' is returned - confirm.
 */
192 static int clear_state_bit(struct extent_io_tree *tree,
193 struct extent_state *state, int bits)
195 int ret = state->state & bits;
197 state->state &= ~bits;
198 if (state->state == 0) {
199 remove_cache_extent(&tree->state, &state->cache_node);
200 btrfs_free_extent_state(state);
202 merge_state(tree, state);
208 * clear some bits on a range in the tree.
/*
 * Clear 'bits' on the inclusive range [start, end] of 'tree'.
 *
 * States that straddle a range boundary are split first, using the
 * preallocated extent_state, so that bits are cleared only on the part that
 * lies inside the range. 'set' accumulates the values returned by
 * clear_state_bit().
 *
 * NOTE(review): several local declarations, the labels, the gotos and the
 * return statement are missing from this listing, so the retry and exit flow
 * cannot be confirmed from here.
 */
210 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
212 struct extent_state *state;
213 struct extent_state *prealloc = NULL;
214 struct cache_extent *node;
221 prealloc = alloc_extent_state();
227 * this search will find the extents that end after
230 node = search_cache_extent(&tree->state, start);
233 state = container_of(node, struct extent_state, cache_node);
234 if (state->start > end)
/*
 * Remember the end now: clear_state_bit() may release 'state' before the
 * next search position is computed.
 */
236 last_end = state->end;
239 * | ---- desired range ---- |
241 * | ------------- state -------------- |
243 * We need to split the extent we found, and may flip
244 * bits on second half.
246 * If the extent we found extends past our range, we
247 * just split and search again. It'll get split again
248 * the next time though.
250 * If the extent we found is inside our range, we clear
251 * the desired bit on it.
253 if (state->start < start) {
254 err = split_state(tree, state, prealloc, start);
255 BUG_ON(err == -EEXIST);
259 if (state->end <= end) {
260 set |= clear_state_bit(tree, state, bits);
/* (u64)-1 is the largest offset; last_end + 1 would wrap around to 0. */
261 if (last_end == (u64)-1)
263 start = last_end + 1;
265 start = state->start;
270 * | ---- desired range ---- |
272 * We need to split the extent, and clear the bit
275 if (state->start <= end && state->end > end) {
276 err = split_state(tree, state, prealloc, end + 1);
277 BUG_ON(err == -EEXIST);
/* After the split 'prealloc' is the piece ending at 'end'. */
279 set |= clear_state_bit(tree, prealloc, bits);
/* Read state->end before clear_state_bit() can release 'state'. */
284 start = state->end + 1;
285 set |= clear_state_bit(tree, state, bits);
286 if (last_end == (u64)-1)
288 start = last_end + 1;
292 btrfs_free_extent_state(prealloc);
302 * set some bits on a range in the tree.
/*
 * Set 'bits' on the inclusive range [start, end] of 'tree'.
 *
 * The visible code handles these situations for the state found by the
 * search:
 *  - nothing found: insert the preallocated state for the whole range;
 *  - state begins exactly at 'start' and ends inside the range: set the bits
 *    on it, merge, and continue after it;
 *  - state begins before 'start': split it at 'start' first;
 *  - state begins after 'start': fill the hole in front of it with the
 *    preallocated state;
 *  - otherwise the state begins at 'start' and runs past 'end': split it at
 *    end + 1.
 *
 * NOTE(review): in the last case the bits are OR-ed into 'state' after the
 * split. split_state() gives 'prealloc' the piece [state->start, end], and
 * clear_extent_bits() operates on 'prealloc' in the equivalent situation, so
 * this looks like the bits land on the part beyond 'end' rather than on the
 * requested range - verify against the full file.
 *
 * NOTE(review): several local declarations, the labels, the gotos and the
 * return statement are missing from this listing.
 */
304 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
306 struct extent_state *state;
307 struct extent_state *prealloc = NULL;
308 struct cache_extent *node;
314 prealloc = alloc_extent_state();
320 * this search will find the extents that end after
323 node = search_cache_extent(&tree->state, start);
325 err = insert_state(tree, prealloc, start, end, bits);
326 BUG_ON(err == -EEXIST);
331 state = container_of(node, struct extent_state, cache_node);
/* Saved up front because merge_state() may release 'state'. */
332 last_start = state->start;
333 last_end = state->end;
336 * | ---- desired range ---- |
339 * Just lock what we found and keep going
341 if (state->start == start && state->end <= end) {
342 state->state |= bits;
343 merge_state(tree, state);
/* (u64)-1 is the largest offset; last_end + 1 would wrap around to 0. */
344 if (last_end == (u64)-1)
346 start = last_end + 1;
350 * | ---- desired range ---- |
353 * | ------------- state -------------- |
355 * We need to split the extent we found, and may flip bits on
358 * If the extent we found extends past our
359 * range, we just split and search again. It'll get split
360 * again the next time though.
362 * If the extent we found is inside our range, we set the
365 if (state->start < start) {
366 err = split_state(tree, state, prealloc, start);
367 BUG_ON(err == -EEXIST);
371 if (state->end <= end) {
372 state->state |= bits;
373 start = state->end + 1;
374 merge_state(tree, state);
375 if (last_end == (u64)-1)
377 start = last_end + 1;
379 start = state->start;
384 * | ---- desired range ---- |
385 * | state | or | state |
387 * There's a hole, we need to insert something in it and
388 * ignore the extent we found.
390 if (state->start > start) {
392 if (end < last_start)
395 this_end = last_start -1;
396 err = insert_state(tree, prealloc, start, this_end,
398 BUG_ON(err == -EEXIST);
402 start = this_end + 1;
406 * | ---- desired range ---- |
407 * | ---------- state ---------- |
408 * We need to split the extent, and set the bit
411 err = split_state(tree, state, prealloc, end + 1);
412 BUG_ON(err == -EEXIST);
/* NOTE(review): 'prealloc', not 'state', holds [start, end] here - verify. */
414 state->state |= bits;
415 merge_state(tree, prealloc);
419 btrfs_free_extent_state(prealloc);
427 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
429 return set_extent_bits(tree, start, end, EXTENT_DIRTY);
432 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
434 return clear_extent_bits(tree, start, end, EXTENT_DIRTY);
/*
 * Starting from the state found for offset 'start', walk tree->state looking
 * for the first extent state that ends at or after 'start' and has at least
 * one of 'bits' set. On a match its range is stored in *start_ret and
 * *end_ret.
 *
 * NOTE(review): the loop construct, the handling of an empty search result
 * and the return value are not visible in this listing.
 */
437 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
438 u64 *start_ret, u64 *end_ret, int bits)
440 struct cache_extent *node;
441 struct extent_state *state;
445 * this search will find all the extents that end after
448 node = search_cache_extent(&tree->state, start);
453 state = container_of(node, struct extent_state, cache_node);
454 if (state->end >= start && (state->state & bits)) {
455 *start_ret = state->start;
456 *end_ret = state->end;
460 node = next_cache_extent(node);
/*
 * Test 'bits' across the inclusive range [start, end]. The walk goes state by
 * state from the first one found for 'start' and advances 'start' past each
 * visited state. With 'filled' non-zero, a state that begins after the
 * current 'start' (a gap in coverage) is singled out.
 *
 * NOTE(review): the bodies of the conditionals and the return value are
 * missing from this listing. Presumably a non-zero 'filled' requires the
 * whole range to carry the bits while zero accepts any overlap - confirm.
 */
468 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
469 int bits, int filled)
471 struct extent_state *state = NULL;
472 struct cache_extent *node;
475 node = search_cache_extent(&tree->state, start);
476 while (node && start <= end) {
477 state = container_of(node, struct extent_state, cache_node);
479 if (filled && state->start > start) {
483 if (state->start > end)
485 if (state->state & bits) {
493 start = state->end + 1;
496 node = next_cache_extent(node);
/*
 * Store 'private' in the xprivate field of the extent state found for
 * 'start'. The state->start != start test singles out a state that does not
 * begin exactly at 'start'.
 *
 * NOTE(review): the handling of a failed search, the body of the mismatch
 * branch and the return value are not visible in this listing.
 */
506 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
508 struct cache_extent *node;
509 struct extent_state *state;
512 node = search_cache_extent(&tree->state, start);
517 state = container_of(node, struct extent_state, cache_node);
518 if (state->start != start) {
522 state->xprivate = private;
/*
 * Read back the xprivate field of the extent state found for 'start' into
 * *private. The state->start != start test singles out a state that does not
 * begin exactly at 'start'.
 *
 * NOTE(review): the handling of a failed search, the body of the mismatch
 * branch and the return value are not visible in this listing.
 */
527 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
529 struct cache_extent *node;
530 struct extent_state *state;
533 node = search_cache_extent(&tree->state, start);
538 state = container_of(node, struct extent_state, cache_node);
539 if (state->start != start) {
543 *private = state->xprivate;
/*
 * Allocate a zero-filled extent_buffer with 'blocksize' extra bytes in the
 * same allocation. The visible initialisation sets dev_bytenr to (u64)-1
 * (presumably meaning "not mapped yet"), records the range in cache_node and
 * initialises the recow and lru list heads.
 *
 * NOTE(review): several lines after the calloc() are missing from this
 * listing - presumably the NULL check, the remaining field assignments
 * (including the use of 'tree') and the return; confirm in the full file.
 */
548 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
549 u64 bytenr, u32 blocksize)
551 struct extent_buffer *eb;
553 eb = calloc(1, sizeof(struct extent_buffer) + blocksize);
563 eb->dev_bytenr = (u64)-1;
564 eb->cache_node.start = bytenr;
565 eb->cache_node.size = blocksize;
566 INIT_LIST_HEAD(&eb->recow);
567 INIT_LIST_HEAD(&eb->lru);
/*
 * Create a copy of 'src': allocate a buffer with the same start and len but
 * a NULL tree argument, copy all src->len bytes of data into it and flag it
 * EXTENT_BUFFER_DUMMY.
 *
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this listing.
 */
572 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
574 struct extent_buffer *new;
576 new = __alloc_extent_buffer(NULL, src->start, src->len);
580 copy_extent_buffer(new, src, 0, 0, src->len);
581 new->flags |= EXTENT_BUFFER_DUMMY;
/*
 * Final teardown step for an extent_buffer: unlink it from the LRU list and,
 * unless it is flagged EXTENT_BUFFER_DUMMY, remove it from its tree's cache
 * and subtract its length from tree->cache_size. The BUG_ON guards that
 * subtraction against underflow when a tree is attached.
 *
 * NOTE(review): the lines that release the memory are missing from this
 * listing. Non-dummy buffers dereference 'tree' without a NULL check, so
 * they are expected to always have an owning tree - confirm.
 */
586 static void free_extent_buffer_final(struct extent_buffer *eb)
588 struct extent_io_tree *tree = eb->tree;
591 BUG_ON(tree && tree->cache_size < eb->len);
592 list_del_init(&eb->lru);
593 if (!(eb->flags & EXTENT_BUFFER_DUMMY)) {
594 remove_cache_extent(&tree->cache, &eb->cache_node);
595 tree->cache_size -= eb->len;
/*
 * Common release path behind free_extent_buffer() and
 * free_extent_buffer_nocache(). NULL and IS_ERR() pointers are screened out
 * first. Releasing a buffer that is still flagged EXTENT_DIRTY trips a
 * BUG_ON. The buffer is unlinked from its recow list and, when it is a dummy
 * buffer or 'free_now' is set, torn down through free_extent_buffer_final().
 *
 * NOTE(review): the reference-count decrement and the condition wrapping the
 * teardown (presumably refs reaching zero) are not visible in this listing.
 */
600 static void free_extent_buffer_internal(struct extent_buffer *eb, bool free_now)
602 if (!eb || IS_ERR(eb))
606 BUG_ON(eb->refs < 0);
608 BUG_ON(eb->flags & EXTENT_DIRTY);
609 list_del_init(&eb->recow);
610 if (eb->flags & EXTENT_BUFFER_DUMMY || free_now)
611 free_extent_buffer_final(eb);
/*
 * Release 'eb' through free_extent_buffer_internal() with the free_now flag
 * cleared.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	const bool free_now = false;

	free_extent_buffer_internal(eb, free_now);
}
/*
 * Release 'eb' through free_extent_buffer_internal() with the free_now flag
 * set.
 */
void free_extent_buffer_nocache(struct extent_buffer *eb)
{
	const bool free_now = true;

	free_extent_buffer_internal(eb, free_now);
}
/*
 * Look up a cached extent_buffer whose start and size match 'bytenr' and
 * 'blocksize' exactly. A hit is moved to the tail of the tree's LRU list.
 * An entry returned by lookup_cache_extent() that does not match exactly is
 * not accepted.
 *
 * NOTE(review): the return statement and any reference counting on a hit are
 * missing from this listing; 'eb' starts out NULL, so a miss presumably
 * returns NULL.
 */
625 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
626 u64 bytenr, u32 blocksize)
628 struct extent_buffer *eb = NULL;
629 struct cache_extent *cache;
631 cache = lookup_cache_extent(&tree->cache, bytenr, blocksize);
632 if (cache && cache->start == bytenr &&
633 cache->size == blocksize) {
634 eb = container_of(cache, struct extent_buffer, cache_node);
635 list_move_tail(&eb->lru, &tree->lru);
/*
 * Fetch the cached extent_buffer that search_cache_extent() reports for
 * offset 'start' and move it to the tail of the tree's LRU list.
 *
 * NOTE(review): the rest of the parameter list (which must declare 'start'),
 * the check for an empty search result, any reference counting and the
 * return statement are not visible in this listing.
 */
641 struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree,
644 struct extent_buffer *eb = NULL;
645 struct cache_extent *cache;
647 cache = search_cache_extent(&tree->cache, start);
649 eb = container_of(cache, struct extent_buffer, cache_node);
650 list_move_tail(&eb->lru, &tree->lru);
/*
 * Shrink the buffer cache by walking the LRU list from its head and tearing
 * buffers down with free_extent_buffer_final(). The check against 9/10 of
 * max_cache_size marks the target size.
 *
 * The _safe list iterator is used because entries are unlinked during the
 * walk.
 *
 * NOTE(review): the condition guarding free_extent_buffer_final() (presumably
 * "no references left") and the statement executed once the target is
 * reached (presumably a break) are not visible in this listing.
 */
656 static void trim_extent_buffer_cache(struct extent_io_tree *tree)
658 struct extent_buffer *eb, *tmp;
660 list_for_each_entry_safe(eb, tmp, &tree->lru, lru) {
662 free_extent_buffer_final(eb);
663 if (tree->cache_size <= ((tree->max_cache_size * 9) / 10))
/*
 * Get the extent_buffer for 'bytenr' / 'blocksize', reusing a cached one when
 * the cache holds an entry with exactly that start and size (the entry is
 * then moved to the tail of the LRU list).
 *
 * Otherwise a new buffer is allocated, inserted into tree->cache, appended to
 * the LRU list and accounted in tree->cache_size; once cache_size reaches
 * max_cache_size the cache is trimmed.
 *
 * NOTE(review): the branch structure is only partly visible. The
 * free_extent_buffer() call presumably handles a cached entry that overlaps
 * the request without matching it exactly. Reference counting, handling of
 * allocation or insertion failure and the return statements are missing from
 * this listing.
 */
668 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
669 u64 bytenr, u32 blocksize)
671 struct extent_buffer *eb;
672 struct cache_extent *cache;
674 cache = lookup_cache_extent(&tree->cache, bytenr, blocksize);
675 if (cache && cache->start == bytenr &&
676 cache->size == blocksize) {
677 eb = container_of(cache, struct extent_buffer, cache_node);
678 list_move_tail(&eb->lru, &tree->lru);
684 eb = container_of(cache, struct extent_buffer,
686 free_extent_buffer(eb);
688 eb = __alloc_extent_buffer(tree, bytenr, blocksize);
691 ret = insert_cache_extent(&tree->cache, &eb->cache_node);
696 list_add_tail(&eb->lru, &tree->lru);
697 tree->cache_size += blocksize;
698 if (tree->cache_size >= tree->max_cache_size)
699 trim_extent_buffer_cache(tree);
/*
 * Read 'len' bytes with pread() from eb->fd at file position eb->dev_bytenr
 * into the buffer's data, starting at byte 'offset' within the data.
 *
 * NOTE(review): 'offset' is applied to the destination pointer only; the
 * file position handed to pread() is eb->dev_bytenr without the offset -
 * confirm that this is intended. The error and short-read handling and the
 * return value are not visible in this listing.
 */
704 int read_extent_from_disk(struct extent_buffer *eb,
705 unsigned long offset, unsigned long len)
708 ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr);
/*
 * Write the whole buffer (eb->len bytes of eb->data) with pwrite() to eb->fd
 * at file position eb->dev_bytenr. A transfer count different from eb->len
 * is singled out as a separate case.
 *
 * NOTE(review): the error codes chosen for a failed or short write and the
 * return statement are not visible in this listing.
 */
722 int write_extent_to_disk(struct extent_buffer *eb)
725 ret = pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr);
728 if (ret != eb->len) {
/*
 * Read 'bytes' bytes at logical offset 'offset' into 'buf'.
 *
 * The range is mapped with btrfs_map_block(READ) and read with pread() from
 * the first stripe of the mapping. read_len is clamped to what is still
 * outstanding, and bytes_left / total_read track the progress. Mapping
 * failures, read errors and short reads are reported on stderr.
 *
 * NOTE(review): the loop header, the trailing btrfs_map_block() arguments
 * (where 'mirror' is presumably passed), the advance of 'offset' and the
 * return paths are missing from this listing.
 */
737 int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
738 u64 bytes, int mirror)
740 struct btrfs_multi_bio *multi = NULL;
741 struct btrfs_device *device;
742 u64 bytes_left = bytes;
748 read_len = bytes_left;
749 ret = btrfs_map_block(info, READ, offset, &read_len, &multi,
752 fprintf(stderr, "Couldn't map the block %Lu\n",
756 device = multi->stripes[0].dev;
/* The mapping may cover more than what is still wanted. */
758 read_len = min(bytes_left, read_len);
759 if (device->fd <= 0) {
764 ret = pread(device->fd, buf + total_read, read_len,
765 multi->stripes[0].physical);
768 fprintf(stderr, "Error reading %Lu, %d\n", offset,
772 if (ret != read_len) {
773 fprintf(stderr, "Short read for %Lu, read %d, "
774 "read_len %Lu\n", offset, ret, read_len);
778 bytes_left -= read_len;
780 total_read += read_len;
/*
 * Write 'bytes' bytes from 'buf' to logical offset 'offset', one mapped piece
 * per loop iteration.
 *
 * Each iteration maps the remaining range with btrfs_map_block(WRITE). One
 * path copies the data into a temporary, zeroed extent_buffer (capped at
 * info->nodesize) and hands it to write_raid56_with_parity() together with
 * raid_map; the other path pwrite()s the data to the stripes of 'multi'.
 *
 * NOTE(review): the condition selecting the first path (presumably a
 * non-NULL raid_map), the stripe counter update, the cleanup of 'eb' and
 * 'multi', the advance of 'offset' and the return statements are not visible
 * in this listing.
 */
786 int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
787 u64 bytes, int mirror)
789 struct btrfs_multi_bio *multi = NULL;
790 struct btrfs_device *device;
791 u64 bytes_left = bytes;
794 u64 *raid_map = NULL;
799 while (bytes_left > 0) {
800 this_len = bytes_left;
803 ret = btrfs_map_block(info, WRITE, offset, &this_len, &multi,
806 fprintf(stderr, "Couldn't map the block %Lu\n",
812 struct extent_buffer *eb;
/* Keep the mapped length; this_len is clamped below. */
813 u64 stripe_len = this_len;
815 this_len = min(this_len, bytes_left);
816 this_len = min(this_len, (u64)info->nodesize);
818 eb = malloc(sizeof(struct extent_buffer) + this_len);
820 fprintf(stderr, "cannot allocate memory for eb\n");
825 memset(eb, 0, sizeof(struct extent_buffer) + this_len);
829 memcpy(eb->data, buf + total_write, this_len);
830 ret = write_raid56_with_parity(info, eb, multi,
831 stripe_len, raid_map);
837 } else while (dev_nr < multi->num_stripes) {
838 device = multi->stripes[dev_nr].dev;
839 if (device->fd <= 0) {
844 dev_bytenr = multi->stripes[dev_nr].physical;
845 this_len = min(this_len, bytes_left);
848 ret = pwrite(device->fd, buf + total_write, this_len, dev_bytenr);
849 if (ret != this_len) {
851 fprintf(stderr, "Error writing to "
852 "device %d\n", errno);
857 fprintf(stderr, "Short write\n");
/* Guards the subtraction below against underflow. */
864 BUG_ON(bytes_left < this_len);
866 bytes_left -= this_len;
868 total_write += this_len;
/*
 * Flag 'eb' dirty if it is not already: set EXTENT_DIRTY in eb->flags, mark
 * the buffer's byte range [start, start + len - 1] dirty in its tree and
 * call extent_buffer_get() on it (presumably taking a reference that keeps
 * the dirty buffer alive). A buffer that is already dirty is left untouched.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
880 int set_extent_buffer_dirty(struct extent_buffer *eb)
882 struct extent_io_tree *tree = eb->tree;
883 if (!(eb->flags & EXTENT_DIRTY)) {
884 eb->flags |= EXTENT_DIRTY;
885 set_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
886 extent_buffer_get(eb);
/*
 * Undo set_extent_buffer_dirty(): if 'eb' is flagged dirty, clear
 * EXTENT_DIRTY in eb->flags, clear the dirty range
 * [start, start + len - 1] in its tree and release the buffer with
 * free_extent_buffer() (presumably dropping the reference taken when it was
 * dirtied). The flag is cleared before the release because
 * free_extent_buffer_internal() has a BUG_ON for EXTENT_DIRTY.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
891 int clear_extent_buffer_dirty(struct extent_buffer *eb)
893 struct extent_io_tree *tree = eb->tree;
894 if (eb->flags & EXTENT_DIRTY) {
895 eb->flags &= ~EXTENT_DIRTY;
896 clear_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
897 free_extent_buffer(eb);
902 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
903 unsigned long start, unsigned long len)
905 return memcmp(eb->data + start, ptrv, len);
908 void read_extent_buffer(struct extent_buffer *eb, void *dst,
909 unsigned long start, unsigned long len)
911 memcpy(dst, eb->data + start, len);
914 void write_extent_buffer(struct extent_buffer *eb, const void *src,
915 unsigned long start, unsigned long len)
917 memcpy(eb->data + start, src, len);
/*
 * Copy 'len' bytes from src's data at byte offset src_offset into dst's data
 * at byte offset dst_offset. memcpy() is used, so the two regions must not
 * overlap. No bounds checking is done here.
 *
 * NOTE(review): the last line of the parameter list (which must declare
 * 'len') is not present in this listing.
 */
920 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
921 unsigned long dst_offset, unsigned long src_offset,
924 memcpy(dst->data + dst_offset, src->data + src_offset, len);
927 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
928 unsigned long src_offset, unsigned long len)
930 memmove(dst->data + dst_offset, dst->data + src_offset, len);
933 void memset_extent_buffer(struct extent_buffer *eb, char c,
934 unsigned long start, unsigned long len)
936 memset(eb->data + start, c, len);
/*
 * Test bit number 'nr' in the bitmap that begins at byte offset 'start' of
 * the buffer's data, using le_test_bit() on the data viewed as u8.
 *
 * NOTE(review): the last line of the parameter list (which must declare
 * 'nr') is not present in this listing.
 */
939 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
942 return le_test_bit(nr, (u8 *)eb->data + start);