3 #include "kerncompat.h"
4 #include "radix-tree.h"
8 static inline void init_path(struct ctree_path *p)
10 memset(p, 0, sizeof(*p));
13 static void release_path(struct ctree_root *root, struct ctree_path *p)
16 for (i = 0; i < MAX_LEVEL; i++) {
19 tree_block_release(root, p->nodes[i]);
23 static inline unsigned int leaf_data_end(struct leaf *leaf)
25 unsigned int nr = leaf->header.nritems;
27 return ARRAY_SIZE(leaf->data);
28 return leaf->items[nr-1].offset;
31 static inline int leaf_free_space(struct leaf *leaf)
33 int data_end = leaf_data_end(leaf);
34 int nritems = leaf->header.nritems;
35 char *items_end = (char *)(leaf->items + nritems + 1);
36 return (char *)(leaf->data + data_end) - (char *)items_end;
39 int comp_keys(struct key *k1, struct key *k2)
41 if (k1->objectid > k2->objectid)
43 if (k1->objectid < k2->objectid)
45 if (k1->flags > k2->flags)
47 if (k1->flags < k2->flags)
49 if (k1->offset > k2->offset)
51 if (k1->offset < k2->offset)
55 int generic_bin_search(char *p, int item_size, struct key *key,
65 mid = (low + high) / 2;
66 tmp = (struct key *)(p + mid * item_size);
67 ret = comp_keys(tmp, key);
82 int bin_search(struct node *c, struct key *key, int *slot)
84 if (is_leaf(c->header.flags)) {
85 struct leaf *l = (struct leaf *)c;
86 return generic_bin_search((void *)l->items, sizeof(struct item),
87 key, c->header.nritems, slot);
89 return generic_bin_search((void *)c->keys, sizeof(struct key),
90 key, c->header.nritems, slot);
95 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
97 struct tree_buffer *b = root->node;
106 level = node_level(c->header.flags);
108 ret = bin_search(c, key, &slot);
109 if (!is_leaf(c->header.flags)) {
112 p->slots[level] = slot;
113 b = read_tree_block(root, c->blockptrs[slot]);
116 p->slots[level] = slot;
123 static void fixup_low_keys(struct ctree_root *root,
124 struct ctree_path *path, struct key *key,
128 /* adjust the pointers going up the tree */
129 for (i = level; i < MAX_LEVEL; i++) {
131 int tslot = path->slots[i];
134 t = &path->nodes[i]->node;
135 memcpy(t->keys + tslot, key, sizeof(*key));
136 write_tree_block(root, path->nodes[i]);
142 int __insert_ptr(struct ctree_root *root,
143 struct ctree_path *path, struct key *key,
144 u64 blocknr, int slot, int level)
148 struct key *lower_key;
150 /* need a new root */
151 if (!path->nodes[level]) {
152 struct tree_buffer *t;
153 t = alloc_free_block(root);
155 memset(c, 0, sizeof(c));
156 c->header.nritems = 2;
157 c->header.flags = node_level(level);
158 c->header.blocknr = t->blocknr;
159 lower = &path->nodes[level-1]->node;
160 if (is_leaf(lower->header.flags))
161 lower_key = &((struct leaf *)lower)->items[0].key;
163 lower_key = lower->keys;
164 memcpy(c->keys, lower_key, sizeof(struct key));
165 memcpy(c->keys + 1, key, sizeof(struct key));
166 c->blockptrs[0] = path->nodes[level-1]->blocknr;
167 c->blockptrs[1] = blocknr;
168 /* the path has an extra ref to root->node */
169 tree_block_release(root, root->node);
172 write_tree_block(root, t);
173 path->nodes[level] = t;
174 path->slots[level] = 0;
175 if (c->keys[1].objectid == 0)
179 lower = &path->nodes[level]->node;
180 nritems = lower->header.nritems;
183 if (nritems == NODEPTRS_PER_BLOCK)
185 if (slot != nritems) {
186 memmove(lower->keys + slot + 1, lower->keys + slot,
187 (nritems - slot) * sizeof(struct key));
188 memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
189 (nritems - slot) * sizeof(u64));
191 memcpy(lower->keys + slot, key, sizeof(struct key));
192 lower->blockptrs[slot] = blocknr;
193 lower->header.nritems++;
194 if (lower->keys[1].objectid == 0)
196 write_tree_block(root, path->nodes[level]);
200 int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
208 struct tree_buffer *t;
209 struct tree_buffer *right_buf;
211 if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
213 slot = path->slots[level + 1];
217 t = read_tree_block(root,
218 path->nodes[level + 1]->node.blockptrs[slot - 1]);
220 right_buf = path->nodes[level];
221 right = &right_buf->node;
222 left_nritems = left->header.nritems;
223 right_nritems = right->header.nritems;
224 push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
225 if (push_items <= 0) {
226 tree_block_release(root, t);
230 if (right_nritems < push_items)
231 push_items = right_nritems;
232 memcpy(left->keys + left_nritems, right->keys,
233 push_items * sizeof(struct key));
234 memcpy(left->blockptrs + left_nritems, right->blockptrs,
235 push_items * sizeof(u64));
236 memmove(right->keys, right->keys + push_items,
237 (right_nritems - push_items) * sizeof(struct key));
238 memmove(right->blockptrs, right->blockptrs + push_items,
239 (right_nritems - push_items) * sizeof(u64));
240 right->header.nritems -= push_items;
241 left->header.nritems += push_items;
243 /* adjust the pointers going up the tree */
244 fixup_low_keys(root, path, right->keys, level + 1);
246 write_tree_block(root, t);
247 write_tree_block(root, right_buf);
249 /* then fixup the leaf pointer in the path */
250 if (path->slots[level] < push_items) {
251 path->slots[level] += left_nritems;
252 tree_block_release(root, path->nodes[level]);
253 path->nodes[level] = t;
254 path->slots[level + 1] -= 1;
256 path->slots[level] -= push_items;
257 tree_block_release(root, t);
262 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
265 struct tree_buffer *t;
266 struct tree_buffer *src_buffer;
273 if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
275 slot = path->slots[level + 1];
276 if (slot == NODEPTRS_PER_BLOCK - 1)
279 if (slot >= path->nodes[level + 1]->node.header.nritems -1)
282 t = read_tree_block(root,
283 path->nodes[level + 1]->node.blockptrs[slot + 1]);
285 src_buffer = path->nodes[level];
286 src = &src_buffer->node;
287 dst_nritems = dst->header.nritems;
288 src_nritems = src->header.nritems;
289 push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
290 if (push_items <= 0) {
291 tree_block_release(root, t);
295 if (src_nritems < push_items)
296 push_items = src_nritems;
297 memmove(dst->keys + push_items, dst->keys,
298 dst_nritems * sizeof(struct key));
299 memcpy(dst->keys, src->keys + src_nritems - push_items,
300 push_items * sizeof(struct key));
302 memmove(dst->blockptrs + push_items, dst->blockptrs,
303 dst_nritems * sizeof(u64));
304 memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
305 push_items * sizeof(u64));
307 src->header.nritems -= push_items;
308 dst->header.nritems += push_items;
310 /* adjust the pointers going up the tree */
311 memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
312 dst->keys, sizeof(struct key));
314 write_tree_block(root, path->nodes[level + 1]);
315 write_tree_block(root, t);
316 write_tree_block(root, src_buffer);
318 /* then fixup the leaf pointer in the path */
319 if (path->slots[level] >= src->header.nritems) {
320 path->slots[level] -= src->header.nritems;
321 tree_block_release(root, path->nodes[level]);
322 path->nodes[level] = t;
323 path->slots[level + 1] += 1;
325 tree_block_release(root, t);
330 int insert_ptr(struct ctree_root *root,
331 struct ctree_path *path, struct key *key,
332 u64 blocknr, int level)
334 struct tree_buffer *t = path->nodes[level];
335 struct node *c = &path->nodes[level]->node;
337 struct tree_buffer *b_buffer;
338 struct tree_buffer *bal[MAX_LEVEL];
339 int bal_level = level;
343 memset(bal, 0, ARRAY_SIZE(bal));
344 while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
346 if (push_node_left(root, path,
347 node_level(c->header.flags)) == 0)
349 if (push_node_right(root, path,
350 node_level(c->header.flags)) == 0)
352 bal_start = bal_level;
353 if (bal_level == MAX_LEVEL - 1)
355 b_buffer = alloc_free_block(root);
357 b->header.flags = c->header.flags;
358 b->header.blocknr = b_buffer->blocknr;
359 mid = (c->header.nritems + 1) / 2;
360 memcpy(b->keys, c->keys + mid,
361 (c->header.nritems - mid) * sizeof(struct key));
362 memcpy(b->blockptrs, c->blockptrs + mid,
363 (c->header.nritems - mid) * sizeof(u64));
364 b->header.nritems = c->header.nritems - mid;
365 c->header.nritems = mid;
367 write_tree_block(root, t);
368 write_tree_block(root, b_buffer);
370 bal[bal_level] = b_buffer;
371 if (bal_level == MAX_LEVEL - 1)
374 t = path->nodes[bal_level];
376 while(bal_start > 0) {
377 b_buffer = bal[bal_start];
378 c = &path->nodes[bal_start]->node;
379 __insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
380 path->slots[bal_start + 1] + 1, bal_start + 1);
381 if (path->slots[bal_start] >= c->header.nritems) {
382 path->slots[bal_start] -= c->header.nritems;
383 tree_block_release(root, path->nodes[bal_start]);
384 path->nodes[bal_start] = b_buffer;
385 path->slots[bal_start + 1] += 1;
387 tree_block_release(root, b_buffer);
393 return __insert_ptr(root, path, key, blocknr, path->slots[level] + 1,
397 int leaf_space_used(struct leaf *l, int start, int nr)
400 int end = start + nr - 1;
404 data_len = l->items[start].offset + l->items[start].size;
405 data_len = data_len - l->items[end].offset;
406 data_len += sizeof(struct item) * nr;
410 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
413 struct tree_buffer *right_buf = path->nodes[0];
414 struct leaf *right = &right_buf->leaf;
415 struct tree_buffer *t;
423 int old_left_nritems;
425 slot = path->slots[1];
429 if (!path->nodes[1]) {
432 t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
434 free_space = leaf_free_space(left);
435 if (free_space < data_size + sizeof(struct item)) {
436 tree_block_release(root, t);
439 for (i = 0; i < right->header.nritems; i++) {
440 item = right->items + i;
441 if (path->slots[0] == i)
442 push_space += data_size + sizeof(*item);
443 if (item->size + sizeof(*item) + push_space > free_space)
446 push_space += item->size + sizeof(*item);
448 if (push_items == 0) {
449 tree_block_release(root, t);
452 /* push data from right to left */
453 memcpy(left->items + left->header.nritems,
454 right->items, push_items * sizeof(struct item));
455 push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
456 memcpy(left->data + leaf_data_end(left) - push_space,
457 right->data + right->items[push_items - 1].offset,
459 old_left_nritems = left->header.nritems;
460 BUG_ON(old_left_nritems < 0);
462 for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
463 left->items[i].offset -= LEAF_DATA_SIZE -
464 left->items[old_left_nritems -1].offset;
466 left->header.nritems += push_items;
468 /* fixup right node */
469 push_space = right->items[push_items-1].offset - leaf_data_end(right);
470 memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
471 leaf_data_end(right), push_space);
472 memmove(right->items, right->items + push_items,
473 (right->header.nritems - push_items) * sizeof(struct item));
474 right->header.nritems -= push_items;
475 push_space = LEAF_DATA_SIZE;
477 for (i = 0; i < right->header.nritems; i++) {
478 right->items[i].offset = push_space - right->items[i].size;
479 push_space = right->items[i].offset;
482 write_tree_block(root, t);
483 write_tree_block(root, right_buf);
485 fixup_low_keys(root, path, &right->items[0].key, 1);
487 /* then fixup the leaf pointer in the path */
488 if (path->slots[0] < push_items) {
489 path->slots[0] += old_left_nritems;
490 tree_block_release(root, path->nodes[0]);
494 tree_block_release(root, t);
495 path->slots[0] -= push_items;
497 BUG_ON(path->slots[0] < 0);
501 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
503 struct tree_buffer *l_buf = path->nodes[0];
504 struct leaf *l = &l_buf->leaf;
509 struct tree_buffer *right_buffer;
510 int space_needed = data_size + sizeof(struct item);
516 if (push_leaf_left(root, path, data_size) == 0) {
517 l_buf = path->nodes[0];
519 if (leaf_free_space(l) >= sizeof(struct item) + data_size)
522 slot = path->slots[0];
523 nritems = l->header.nritems;
524 mid = (nritems + 1)/ 2;
526 right_buffer = alloc_free_block(root);
527 BUG_ON(!right_buffer);
528 BUG_ON(mid == nritems);
529 right = &right_buffer->leaf;
530 memset(right, 0, sizeof(*right));
532 if (leaf_space_used(l, mid, nritems - mid) + space_needed >
536 if (leaf_space_used(l, 0, mid + 1) + space_needed >
540 right->header.nritems = nritems - mid;
541 right->header.blocknr = right_buffer->blocknr;
542 right->header.flags = node_level(0);
543 data_copy_size = l->items[mid].offset + l->items[mid].size -
545 memcpy(right->items, l->items + mid,
546 (nritems - mid) * sizeof(struct item));
547 memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
548 l->data + leaf_data_end(l), data_copy_size);
549 rt_data_off = LEAF_DATA_SIZE -
550 (l->items[mid].offset + l->items[mid].size);
551 for (i = 0; i < right->header.nritems; i++) {
552 right->items[i].offset += rt_data_off;
554 l->header.nritems = mid;
555 ret = insert_ptr(root, path, &right->items[0].key,
556 right_buffer->blocknr, 1);
558 write_tree_block(root, right_buffer);
559 write_tree_block(root, l_buf);
561 BUG_ON(path->slots[0] != slot);
563 tree_block_release(root, path->nodes[0]);
564 path->nodes[0] = right_buffer;
565 path->slots[0] -= mid;
568 tree_block_release(root, right_buffer);
569 BUG_ON(path->slots[0] < 0);
573 int insert_item(struct ctree_root *root, struct key *key,
574 void *data, int data_size)
580 struct tree_buffer *leaf_buf;
581 unsigned int nritems;
582 unsigned int data_end;
583 struct ctree_path path;
586 struct tree_buffer *t;
587 t = alloc_free_block(root);
589 t->node.header.nritems = 0;
590 t->node.header.flags = node_level(0);
591 t->node.header.blocknr = t->blocknr;
593 write_tree_block(root, t);
596 ret = search_slot(root, key, &path);
598 release_path(root, &path);
602 slot_orig = path.slots[0];
603 leaf_buf = path.nodes[0];
604 leaf = &leaf_buf->leaf;
605 if (leaf_free_space(leaf) < sizeof(struct item) + data_size) {
606 split_leaf(root, &path, data_size);
607 leaf_buf = path.nodes[0];
608 leaf = &path.nodes[0]->leaf;
610 nritems = leaf->header.nritems;
611 data_end = leaf_data_end(leaf);
613 if (leaf_free_space(leaf) < sizeof(struct item) + data_size)
616 slot = path.slots[0];
619 fixup_low_keys(root, &path, key, 1);
620 if (slot != nritems) {
622 unsigned int old_data = leaf->items[slot].offset +
623 leaf->items[slot].size;
626 * item0..itemN ... dataN.offset..dataN.size .. data0.size
628 /* first correct the data pointers */
629 for (i = slot; i < nritems; i++)
630 leaf->items[i].offset -= data_size;
632 /* shift the items */
633 memmove(leaf->items + slot + 1, leaf->items + slot,
634 (nritems - slot) * sizeof(struct item));
637 memmove(leaf->data + data_end - data_size, leaf->data +
638 data_end, old_data - data_end);
641 memcpy(&leaf->items[slot].key, key, sizeof(struct key));
642 leaf->items[slot].offset = data_end - data_size;
643 leaf->items[slot].size = data_size;
644 memcpy(leaf->data + data_end - data_size, data, data_size);
645 leaf->header.nritems += 1;
646 write_tree_block(root, leaf_buf);
647 if (leaf_free_space(leaf) < 0)
649 release_path(root, &path);
653 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
656 struct tree_buffer *t;
661 t = path->nodes[level];
665 slot = path->slots[level];
666 nritems = node->header.nritems;
668 if (slot != nritems -1) {
669 memmove(node->keys + slot, node->keys + slot + 1,
670 sizeof(struct key) * (nritems - slot - 1));
671 memmove(node->blockptrs + slot,
672 node->blockptrs + slot + 1,
673 sizeof(u64) * (nritems - slot - 1));
675 node->header.nritems--;
676 write_tree_block(root, t);
677 if (node->header.nritems != 0) {
680 fixup_low_keys(root, path, node->keys,
682 tslot = path->slots[level+1];
684 push_node_left(root, path, level);
685 if (node->header.nritems) {
686 push_node_right(root, path, level);
688 if (node->header.nritems) {
689 tree_block_release(root, t);
692 tree_block_release(root, t);
693 path->slots[level+1] = tslot;
695 if (t == root->node) {
696 /* just turn the root into a leaf and break */
697 root->node->node.header.flags = node_level(0);
698 write_tree_block(root, t);
702 if (!path->nodes[level])
708 int del_item(struct ctree_root *root, struct ctree_path *path)
712 struct tree_buffer *leaf_buf;
716 leaf_buf = path->nodes[0];
717 leaf = &leaf_buf->leaf;
718 slot = path->slots[0];
719 doff = leaf->items[slot].offset;
720 dsize = leaf->items[slot].size;
722 if (slot != leaf->header.nritems - 1) {
724 int data_end = leaf_data_end(leaf);
725 memmove(leaf->data + data_end + dsize,
726 leaf->data + data_end,
728 for (i = slot + 1; i < leaf->header.nritems; i++)
729 leaf->items[i].offset += dsize;
730 memmove(leaf->items + slot, leaf->items + slot + 1,
731 sizeof(struct item) *
732 (leaf->header.nritems - slot - 1));
734 leaf->header.nritems -= 1;
735 if (leaf->header.nritems == 0) {
736 if (leaf_buf == root->node) {
737 leaf->header.flags = node_level(0);
738 write_tree_block(root, leaf_buf);
740 del_ptr(root, path, 1);
743 fixup_low_keys(root, path, &leaf->items[0].key, 1);
744 write_tree_block(root, leaf_buf);
745 if (leaf_space_used(leaf, 0, leaf->header.nritems) <
746 LEAF_DATA_SIZE / 4) {
747 /* push_leaf_left fixes the path.
748 * make sure the path still points to our leaf
749 * for possible call to del_ptr below
751 slot = path->slots[1];
753 push_leaf_left(root, path, 1);
754 if (leaf->header.nritems == 0) {
755 path->slots[1] = slot;
756 del_ptr(root, path, 1);
758 tree_block_release(root, leaf_buf);
764 void print_leaf(struct leaf *l)
767 int nr = l->header.nritems;
769 printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
772 for (i = 0 ; i < nr ; i++) {
774 printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
776 item->key.objectid, item->key.flags, item->key.offset,
777 item->offset, item->size);
779 printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
783 void print_tree(struct ctree_root *root, struct tree_buffer *t)
792 nr = c->header.nritems;
793 if (c->header.blocknr != t->blocknr)
795 if (is_leaf(c->header.flags)) {
796 print_leaf((struct leaf *)c);
799 printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
800 node_level(c->header.flags), c->header.nritems,
801 NODEPTRS_PER_BLOCK - c->header.nritems);
803 for (i = 0; i < nr; i++) {
804 printf("\tkey %d (%lu %u %lu) block %lu\n",
806 c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
810 for (i = 0; i < nr; i++) {
811 struct tree_buffer *next_buf = read_tree_block(root,
813 struct node *next = &next_buf->node;
814 if (is_leaf(next->header.flags) &&
815 node_level(c->header.flags) != 1)
817 if (node_level(next->header.flags) !=
818 node_level(c->header.flags) - 1)
820 print_tree(root, next_buf);
821 tree_block_release(root, next_buf);
826 /* for testing only */
827 int next_key(int i, int max_key) {
828 return rand() % max_key;
833 struct ctree_root *root;
835 struct key last = { (u64)-1, 0, 0};
840 int run_size = 1000000;
841 int max_key = 100000000;
843 struct ctree_path path;
848 root = open_ctree("dbfile");
851 for (i = 0; i < run_size; i++) {
853 num = next_key(i, max_key);
855 sprintf(buf, "string-%d", num);
856 // printf("insert %d\n", num);
860 ret = insert_item(root, &ins, buf, strlen(buf));
865 root = open_ctree("dbfile");
866 printf("starting search\n");
868 for (i = 0; i < run_size; i++) {
869 num = next_key(i, max_key);
872 ret = search_slot(root, &ins, &path);
874 print_tree(root, root->node);
875 printf("unable to find %d\n", num);
878 release_path(root, &path);
881 root = open_ctree("dbfile");
882 printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
883 node_level(root->node->node.header.flags),
884 root->node->node.header.nritems,
885 NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
886 printf("all searches good, deleting some items\n");
889 for (i = 0 ; i < run_size/4; i++) {
890 num = next_key(i, max_key);
893 ret = search_slot(root, &ins, &path);
896 ret = del_item(root, &path);
899 release_path(root, &path);
903 for (i = 0; i < run_size; i++) {
905 num = next_key(i, max_key);
906 sprintf(buf, "string-%d", num);
908 ret = insert_item(root, &ins, buf, strlen(buf));
913 root = open_ctree("dbfile");
914 printf("starting search2\n");
916 for (i = 0; i < run_size; i++) {
917 num = next_key(i, max_key);
920 ret = search_slot(root, &ins, &path);
922 print_tree(root, root->node);
923 printf("unable to find %d\n", num);
926 release_path(root, &path);
928 printf("starting big long delete run\n");
929 while(root->node && root->node->node.header.nritems > 0) {
932 ins.objectid = (u64)-1;
934 ret = search_slot(root, &ins, &path);
938 leaf = &path.nodes[0]->leaf;
939 slot = path.slots[0];
940 if (slot != leaf->header.nritems)
942 while(path.slots[0] > 0) {
944 slot = path.slots[0];
945 leaf = &path.nodes[0]->leaf;
947 if (comp_keys(&last, &leaf->items[slot].key) <= 0)
949 memcpy(&last, &leaf->items[slot].key, sizeof(last));
950 ret = del_item(root, &path);
952 printf("del_item returned %d\n", ret);
957 release_path(root, &path);
960 printf("tree size is now %d\n", tree_size);