extent fixes
[platform/upstream/btrfs-progs.git] / ctree.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include "kerncompat.h"
4 #include "radix-tree.h"
5 #include "ctree.h"
6 #include "disk-io.h"
7
8 static int refill_alloc_extent(struct ctree_root *root);
9
10 static inline void init_path(struct ctree_path *p)
11 {
12         memset(p, 0, sizeof(*p));
13 }
14
15 static void release_path(struct ctree_root *root, struct ctree_path *p)
16 {
17         int i;
18         for (i = 0; i < MAX_LEVEL; i++) {
19                 if (!p->nodes[i])
20                         break;
21                 tree_block_release(root, p->nodes[i]);
22         }
23 }
24
25 /*
26  * The leaf data grows from end-to-front in the node.
27  * this returns the address of the start of the last item,
28  * which is the stop of the leaf data stack
29  */
30 static inline unsigned int leaf_data_end(struct leaf *leaf)
31 {
32         unsigned int nr = leaf->header.nritems;
33         if (nr == 0)
34                 return sizeof(leaf->data);
35         return leaf->items[nr-1].offset;
36 }
37
38 /*
39  * The space between the end of the leaf items and
40  * the start of the leaf data.  IOW, how much room
41  * the leaf has left for both items and data
42  */
43 static inline int leaf_free_space(struct leaf *leaf)
44 {
45         int data_end = leaf_data_end(leaf);
46         int nritems = leaf->header.nritems;
47         char *items_end = (char *)(leaf->items + nritems + 1);
48         return (char *)(leaf->data + data_end) - (char *)items_end;
49 }
50
51 /*
52  * compare two keys in a memcmp fashion
53  */
54 int comp_keys(struct key *k1, struct key *k2)
55 {
56         if (k1->objectid > k2->objectid)
57                 return 1;
58         if (k1->objectid < k2->objectid)
59                 return -1;
60         if (k1->flags > k2->flags)
61                 return 1;
62         if (k1->flags < k2->flags)
63                 return -1;
64         if (k1->offset > k2->offset)
65                 return 1;
66         if (k1->offset < k2->offset)
67                 return -1;
68         return 0;
69 }
70
71 /*
72  * search for key in the array p.  items p are item_size apart
73  * and there are 'max' items in p
74  * the slot in the array is returned via slot, and it points to
75  * the place where you would insert key if it is not found in
76  * the array.
77  *
78  * slot may point to max if the key is bigger than all of the keys
79  */
80 int generic_bin_search(char *p, int item_size, struct key *key,
81                        int max, int *slot)
82 {
83         int low = 0;
84         int high = max;
85         int mid;
86         int ret;
87         struct key *tmp;
88
89         while(low < high) {
90                 mid = (low + high) / 2;
91                 tmp = (struct key *)(p + mid * item_size);
92                 ret = comp_keys(tmp, key);
93
94                 if (ret < 0)
95                         low = mid + 1;
96                 else if (ret > 0)
97                         high = mid;
98                 else {
99                         *slot = mid;
100                         return 0;
101                 }
102         }
103         *slot = low;
104         return 1;
105 }
106
107 int bin_search(struct node *c, struct key *key, int *slot)
108 {
109         if (is_leaf(c->header.flags)) {
110                 struct leaf *l = (struct leaf *)c;
111                 return generic_bin_search((void *)l->items, sizeof(struct item),
112                                           key, c->header.nritems, slot);
113         } else {
114                 return generic_bin_search((void *)c->keys, sizeof(struct key),
115                                           key, c->header.nritems, slot);
116         }
117         return -1;
118 }
119
120 /*
121  * look for key in the tree.  path is filled in with nodes along the way
122  * if key is found, we return zero and you can find the item in the leaf
123  * level of the path (level 0)
124  *
125  * If the key isn't found, the path points to the slot where it should
126  * be inserted.
127  */
128 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
129 {
130         struct tree_buffer *b = root->node;
131         struct node *c;
132
133         int slot;
134         int ret;
135         int level;
136         b->count++;
137         while (b) {
138                 c = &b->node;
139                 level = node_level(c->header.flags);
140                 p->nodes[level] = b;
141                 ret = bin_search(c, key, &slot);
142                 if (!is_leaf(c->header.flags)) {
143                         if (ret && slot > 0)
144                                 slot -= 1;
145                         p->slots[level] = slot;
146                         b = read_tree_block(root, c->blockptrs[slot]);
147                         continue;
148                 } else {
149                         p->slots[level] = slot;
150                         return ret;
151                 }
152         }
153         return -1;
154 }
155
156 /*
157  * adjust the pointers going up the tree, starting at level
158  * making sure the right key of each node is points to 'key'.
159  * This is used after shifting pointers to the left, so it stops
160  * fixing up pointers when a given leaf/node is not in slot 0 of the
161  * higher levels
162  */
163 static void fixup_low_keys(struct ctree_root *root,
164                            struct ctree_path *path, struct key *key,
165                            int level)
166 {
167         int i;
168         for (i = level; i < MAX_LEVEL; i++) {
169                 struct node *t;
170                 int tslot = path->slots[i];
171                 if (!path->nodes[i])
172                         break;
173                 t = &path->nodes[i]->node;
174                 memcpy(t->keys + tslot, key, sizeof(*key));
175                 write_tree_block(root, path->nodes[i]);
176                 if (tslot != 0)
177                         break;
178         }
179 }
180
181 /*
182  * try to push data from one node into the next node left in the
183  * tree.  The src node is found at specified level in the path.
184  * If some bytes were pushed, return 0, otherwise return 1.
185  *
186  * Lower nodes/leaves in the path are not touched, higher nodes may
187  * be modified to reflect the push.
188  *
189  * The path is altered to reflect the push.
190  */
191 int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
192 {
193         int slot;
194         struct node *left;
195         struct node *right;
196         int push_items = 0;
197         int left_nritems;
198         int right_nritems;
199         struct tree_buffer *t;
200         struct tree_buffer *right_buf;
201
202         if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
203                 return 1;
204         slot = path->slots[level + 1];
205         if (slot == 0)
206                 return 1;
207
208         t = read_tree_block(root,
209                             path->nodes[level + 1]->node.blockptrs[slot - 1]);
210         left = &t->node;
211         right_buf = path->nodes[level];
212         right = &right_buf->node;
213         left_nritems = left->header.nritems;
214         right_nritems = right->header.nritems;
215         push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
216         if (push_items <= 0) {
217                 tree_block_release(root, t);
218                 return 1;
219         }
220
221         if (right_nritems < push_items)
222                 push_items = right_nritems;
223         memcpy(left->keys + left_nritems, right->keys,
224                 push_items * sizeof(struct key));
225         memcpy(left->blockptrs + left_nritems, right->blockptrs,
226                 push_items * sizeof(u64));
227         memmove(right->keys, right->keys + push_items,
228                 (right_nritems - push_items) * sizeof(struct key));
229         memmove(right->blockptrs, right->blockptrs + push_items,
230                 (right_nritems - push_items) * sizeof(u64));
231         right->header.nritems -= push_items;
232         left->header.nritems += push_items;
233
234         /* adjust the pointers going up the tree */
235         fixup_low_keys(root, path, right->keys, level + 1);
236
237         write_tree_block(root, t);
238         write_tree_block(root, right_buf);
239
240         /* then fixup the leaf pointer in the path */
241         if (path->slots[level] < push_items) {
242                 path->slots[level] += left_nritems;
243                 tree_block_release(root, path->nodes[level]);
244                 path->nodes[level] = t;
245                 path->slots[level + 1] -= 1;
246         } else {
247                 path->slots[level] -= push_items;
248                 tree_block_release(root, t);
249         }
250         return 0;
251 }
252
253 /*
254  * try to push data from one node into the next node right in the
255  * tree.  The src node is found at specified level in the path.
256  * If some bytes were pushed, return 0, otherwise return 1.
257  *
258  * Lower nodes/leaves in the path are not touched, higher nodes may
259  * be modified to reflect the push.
260  *
261  * The path is altered to reflect the push.
262  */
263 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
264 {
265         int slot;
266         struct tree_buffer *t;
267         struct tree_buffer *src_buffer;
268         struct node *dst;
269         struct node *src;
270         int push_items = 0;
271         int dst_nritems;
272         int src_nritems;
273
274         /* can't push from the root */
275         if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
276                 return 1;
277
278         /* only try to push inside the node higher up */
279         slot = path->slots[level + 1];
280         if (slot == NODEPTRS_PER_BLOCK - 1)
281                 return 1;
282
283         if (slot >= path->nodes[level + 1]->node.header.nritems -1)
284                 return 1;
285
286         t = read_tree_block(root,
287                             path->nodes[level + 1]->node.blockptrs[slot + 1]);
288         dst = &t->node;
289         src_buffer = path->nodes[level];
290         src = &src_buffer->node;
291         dst_nritems = dst->header.nritems;
292         src_nritems = src->header.nritems;
293         push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
294         if (push_items <= 0) {
295                 tree_block_release(root, t);
296                 return 1;
297         }
298
299         if (src_nritems < push_items)
300                 push_items = src_nritems;
301         memmove(dst->keys + push_items, dst->keys,
302                 dst_nritems * sizeof(struct key));
303         memcpy(dst->keys, src->keys + src_nritems - push_items,
304                 push_items * sizeof(struct key));
305
306         memmove(dst->blockptrs + push_items, dst->blockptrs,
307                 dst_nritems * sizeof(u64));
308         memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
309                 push_items * sizeof(u64));
310
311         src->header.nritems -= push_items;
312         dst->header.nritems += push_items;
313
314         /* adjust the pointers going up the tree */
315         memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
316                 dst->keys, sizeof(struct key));
317
318         write_tree_block(root, path->nodes[level + 1]);
319         write_tree_block(root, t);
320         write_tree_block(root, src_buffer);
321
322         /* then fixup the pointers in the path */
323         if (path->slots[level] >= src->header.nritems) {
324                 path->slots[level] -= src->header.nritems;
325                 tree_block_release(root, path->nodes[level]);
326                 path->nodes[level] = t;
327                 path->slots[level + 1] += 1;
328         } else {
329                 tree_block_release(root, t);
330         }
331         return 0;
332 }
333
334 /*
335  * worker function to insert a single pointer in a node.
336  * the node should have enough room for the pointer already
337  * slot and level indicate where you want the key to go, and
338  * blocknr is the block the key points to.
339  */
340 int __insert_ptr(struct ctree_root *root,
341                 struct ctree_path *path, struct key *key,
342                 u64 blocknr, int slot, int level)
343 {
344         struct node *c;
345         struct node *lower;
346         struct key *lower_key;
347         int nritems;
348         /* need a new root */
349         if (!path->nodes[level]) {
350                 struct tree_buffer *t;
351                 t = alloc_free_block(root);
352                 c = &t->node;
353                 memset(c, 0, sizeof(c));
354                 c->header.nritems = 2;
355                 c->header.flags = node_level(level);
356                 c->header.blocknr = t->blocknr;
357                 c->header.parentid = root->node->node.header.parentid;
358                 lower = &path->nodes[level-1]->node;
359                 if (is_leaf(lower->header.flags))
360                         lower_key = &((struct leaf *)lower)->items[0].key;
361                 else
362                         lower_key = lower->keys;
363                 memcpy(c->keys, lower_key, sizeof(struct key));
364                 memcpy(c->keys + 1, key, sizeof(struct key));
365                 c->blockptrs[0] = path->nodes[level-1]->blocknr;
366                 c->blockptrs[1] = blocknr;
367                 /* the super has an extra ref to root->node */
368                 tree_block_release(root, root->node);
369                 root->node = t;
370                 t->count++;
371                 write_tree_block(root, t);
372                 path->nodes[level] = t;
373                 path->slots[level] = 0;
374                 if (c->keys[1].objectid == 0)
375                         BUG();
376                 return 0;
377         }
378         lower = &path->nodes[level]->node;
379         nritems = lower->header.nritems;
380         if (slot > nritems)
381                 BUG();
382         if (nritems == NODEPTRS_PER_BLOCK)
383                 BUG();
384         if (slot != nritems) {
385                 memmove(lower->keys + slot + 1, lower->keys + slot,
386                         (nritems - slot) * sizeof(struct key));
387                 memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
388                         (nritems - slot) * sizeof(u64));
389         }
390         memcpy(lower->keys + slot, key, sizeof(struct key));
391         lower->blockptrs[slot] = blocknr;
392         lower->header.nritems++;
393         if (lower->keys[1].objectid == 0)
394                         BUG();
395         write_tree_block(root, path->nodes[level]);
396         return 0;
397 }
398
399
400 /*
401  * insert a key,blocknr pair into the tree at a given level
402  * If the node at that level in the path doesn't have room,
403  * it is split or shifted as appropriate.
404  */
405 int insert_ptr(struct ctree_root *root,
406                 struct ctree_path *path, struct key *key,
407                 u64 blocknr, int level)
408 {
409         struct tree_buffer *t = path->nodes[level];
410         struct node *c = &path->nodes[level]->node;
411         struct node *b;
412         struct tree_buffer *b_buffer;
413         struct tree_buffer *bal[MAX_LEVEL];
414         int bal_level = level;
415         int mid;
416         int bal_start = -1;
417
418         /*
419          * check to see if we need to make room in the node for this
420          * pointer.  If we do, keep walking the tree, making sure there
421          * is enough room in each level for the required insertions.
422          *
423          * The bal array is filled in with any nodes to be inserted
424          * due to splitting.  Once we've done all the splitting required
425          * do the inserts based on the data in the bal array.
426          */
427         memset(bal, 0, sizeof(bal));
428         while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
429                 c = &t->node;
430                 if (push_node_left(root, path,
431                    node_level(c->header.flags)) == 0)
432                         break;
433                 if (push_node_right(root, path,
434                    node_level(c->header.flags)) == 0)
435                         break;
436                 bal_start = bal_level;
437                 if (bal_level == MAX_LEVEL - 1)
438                         BUG();
439                 b_buffer = alloc_free_block(root);
440                 b = &b_buffer->node;
441                 b->header.flags = c->header.flags;
442                 b->header.blocknr = b_buffer->blocknr;
443                 b->header.parentid = root->node->node.header.parentid;
444                 mid = (c->header.nritems + 1) / 2;
445                 memcpy(b->keys, c->keys + mid,
446                         (c->header.nritems - mid) * sizeof(struct key));
447                 memcpy(b->blockptrs, c->blockptrs + mid,
448                         (c->header.nritems - mid) * sizeof(u64));
449                 b->header.nritems = c->header.nritems - mid;
450                 c->header.nritems = mid;
451
452                 write_tree_block(root, t);
453                 write_tree_block(root, b_buffer);
454
455                 bal[bal_level] = b_buffer;
456                 if (bal_level == MAX_LEVEL - 1)
457                         break;
458                 bal_level += 1;
459                 t = path->nodes[bal_level];
460         }
461         /*
462          * bal_start tells us the first level in the tree that needed to
463          * be split.  Go through the bal array inserting the new nodes
464          * as needed.  The path is fixed as we go.
465          */
466         while(bal_start > 0) {
467                 b_buffer = bal[bal_start];
468                 c = &path->nodes[bal_start]->node;
469                 __insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
470                                 path->slots[bal_start + 1] + 1, bal_start + 1);
471                 if (path->slots[bal_start] >= c->header.nritems) {
472                         path->slots[bal_start] -= c->header.nritems;
473                         tree_block_release(root, path->nodes[bal_start]);
474                         path->nodes[bal_start] = b_buffer;
475                         path->slots[bal_start + 1] += 1;
476                 } else {
477                         tree_block_release(root, b_buffer);
478                 }
479                 bal_start--;
480                 if (!bal[bal_start])
481                         break;
482         }
483         /* Now that the tree has room, insert the requested pointer */
484         return __insert_ptr(root, path, key, blocknr, path->slots[level] + 1,
485                             level);
486 }
487
488 /*
489  * how many bytes are required to store the items in a leaf.  start
490  * and nr indicate which items in the leaf to check.  This totals up the
491  * space used both by the item structs and the item data
492  */
493 int leaf_space_used(struct leaf *l, int start, int nr)
494 {
495         int data_len;
496         int end = start + nr - 1;
497
498         if (!nr)
499                 return 0;
500         data_len = l->items[start].offset + l->items[start].size;
501         data_len = data_len - l->items[end].offset;
502         data_len += sizeof(struct item) * nr;
503         return data_len;
504 }
505
506 /*
507  * push some data in the path leaf to the left, trying to free up at
508  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
509  */
510 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
511                    int data_size)
512 {
513         struct tree_buffer *right_buf = path->nodes[0];
514         struct leaf *right = &right_buf->leaf;
515         struct tree_buffer *t;
516         struct leaf *left;
517         int slot;
518         int i;
519         int free_space;
520         int push_space = 0;
521         int push_items = 0;
522         struct item *item;
523         int old_left_nritems;
524
525         slot = path->slots[1];
526         if (slot == 0) {
527                 return 1;
528         }
529         if (!path->nodes[1]) {
530                 return 1;
531         }
532         t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
533         left = &t->leaf;
534         free_space = leaf_free_space(left);
535         if (free_space < data_size + sizeof(struct item)) {
536                 tree_block_release(root, t);
537                 return 1;
538         }
539         for (i = 0; i < right->header.nritems; i++) {
540                 item = right->items + i;
541                 if (path->slots[0] == i)
542                         push_space += data_size + sizeof(*item);
543                 if (item->size + sizeof(*item) + push_space > free_space)
544                         break;
545                 push_items++;
546                 push_space += item->size + sizeof(*item);
547         }
548         if (push_items == 0) {
549                 tree_block_release(root, t);
550                 return 1;
551         }
552         /* push data from right to left */
553         memcpy(left->items + left->header.nritems,
554                 right->items, push_items * sizeof(struct item));
555         push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
556         memcpy(left->data + leaf_data_end(left) - push_space,
557                 right->data + right->items[push_items - 1].offset,
558                 push_space);
559         old_left_nritems = left->header.nritems;
560         BUG_ON(old_left_nritems < 0);
561
562         for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
563                 left->items[i].offset -= LEAF_DATA_SIZE -
564                         left->items[old_left_nritems -1].offset;
565         }
566         left->header.nritems += push_items;
567
568         /* fixup right node */
569         push_space = right->items[push_items-1].offset - leaf_data_end(right);
570         memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
571                 leaf_data_end(right), push_space);
572         memmove(right->items, right->items + push_items,
573                 (right->header.nritems - push_items) * sizeof(struct item));
574         right->header.nritems -= push_items;
575         push_space = LEAF_DATA_SIZE;
576
577         for (i = 0; i < right->header.nritems; i++) {
578                 right->items[i].offset = push_space - right->items[i].size;
579                 push_space = right->items[i].offset;
580         }
581
582         write_tree_block(root, t);
583         write_tree_block(root, right_buf);
584
585         fixup_low_keys(root, path, &right->items[0].key, 1);
586
587         /* then fixup the leaf pointer in the path */
588         if (path->slots[0] < push_items) {
589                 path->slots[0] += old_left_nritems;
590                 tree_block_release(root, path->nodes[0]);
591                 path->nodes[0] = t;
592                 path->slots[1] -= 1;
593         } else {
594                 tree_block_release(root, t);
595                 path->slots[0] -= push_items;
596         }
597         BUG_ON(path->slots[0] < 0);
598         return 0;
599 }
600
601 /*
602  * split the path's leaf in two, making sure there is at least data_size
603  * available for the resulting leaf level of the path.
604  */
605 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
606 {
607         struct tree_buffer *l_buf = path->nodes[0];
608         struct leaf *l = &l_buf->leaf;
609         int nritems;
610         int mid;
611         int slot;
612         struct leaf *right;
613         struct tree_buffer *right_buffer;
614         int space_needed = data_size + sizeof(struct item);
615         int data_copy_size;
616         int rt_data_off;
617         int i;
618         int ret;
619
620         if (push_leaf_left(root, path, data_size) == 0) {
621                 l_buf = path->nodes[0];
622                 l = &l_buf->leaf;
623                 if (leaf_free_space(l) >= sizeof(struct item) + data_size)
624                         return 0;
625         }
626         slot = path->slots[0];
627         nritems = l->header.nritems;
628         mid = (nritems + 1)/ 2;
629
630         right_buffer = alloc_free_block(root);
631         BUG_ON(!right_buffer);
632         BUG_ON(mid == nritems);
633         right = &right_buffer->leaf;
634         memset(right, 0, sizeof(*right));
635         if (mid <= slot) {
636                 if (leaf_space_used(l, mid, nritems - mid) + space_needed >
637                         LEAF_DATA_SIZE)
638                         BUG();
639         } else {
640                 if (leaf_space_used(l, 0, mid + 1) + space_needed >
641                         LEAF_DATA_SIZE)
642                         BUG();
643         }
644         right->header.nritems = nritems - mid;
645         right->header.blocknr = right_buffer->blocknr;
646         right->header.flags = node_level(0);
647         right->header.parentid = root->node->node.header.parentid;
648         data_copy_size = l->items[mid].offset + l->items[mid].size -
649                          leaf_data_end(l);
650         memcpy(right->items, l->items + mid,
651                (nritems - mid) * sizeof(struct item));
652         memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
653                l->data + leaf_data_end(l), data_copy_size);
654         rt_data_off = LEAF_DATA_SIZE -
655                      (l->items[mid].offset + l->items[mid].size);
656
657         for (i = 0; i < right->header.nritems; i++)
658                 right->items[i].offset += rt_data_off;
659
660         l->header.nritems = mid;
661         ret = insert_ptr(root, path, &right->items[0].key,
662                           right_buffer->blocknr, 1);
663
664         write_tree_block(root, right_buffer);
665         write_tree_block(root, l_buf);
666
667         BUG_ON(path->slots[0] != slot);
668         if (mid <= slot) {
669                 tree_block_release(root, path->nodes[0]);
670                 path->nodes[0] = right_buffer;
671                 path->slots[0] -= mid;
672                 path->slots[1] += 1;
673         } else
674                 tree_block_release(root, right_buffer);
675         BUG_ON(path->slots[0] < 0);
676         return ret;
677 }
678
679 /*
680  * Given a key and some data, insert an item into the tree.
681  * This does all the path init required, making room in the tree if needed.
682  */
683 int insert_item(struct ctree_root *root, struct key *key,
684                           void *data, int data_size)
685 {
686         int ret;
687         int slot;
688         int slot_orig;
689         struct leaf *leaf;
690         struct tree_buffer *leaf_buf;
691         unsigned int nritems;
692         unsigned int data_end;
693         struct ctree_path path;
694
695         refill_alloc_extent(root);
696
697         /* create a root if there isn't one */
698         if (!root->node) {
699                 BUG();
700 #if 0
701                 struct tree_buffer *t;
702                 t = alloc_free_block(root);
703                 BUG_ON(!t);
704                 t->node.header.nritems = 0;
705                 t->node.header.flags = node_level(0);
706                 t->node.header.blocknr = t->blocknr;
707                 root->node = t;
708                 write_tree_block(root, t);
709 #endif
710         }
711         init_path(&path);
712         ret = search_slot(root, key, &path);
713         if (ret == 0) {
714                 release_path(root, &path);
715                 return -EEXIST;
716         }
717
718         slot_orig = path.slots[0];
719         leaf_buf = path.nodes[0];
720         leaf = &leaf_buf->leaf;
721
722         /* make room if needed */
723         if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
724                 split_leaf(root, &path, data_size);
725                 leaf_buf = path.nodes[0];
726                 leaf = &path.nodes[0]->leaf;
727         }
728         nritems = leaf->header.nritems;
729         data_end = leaf_data_end(leaf);
730
731         if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
732                 BUG();
733
734         slot = path.slots[0];
735         BUG_ON(slot < 0);
736         if (slot == 0)
737                 fixup_low_keys(root, &path, key, 1);
738         if (slot != nritems) {
739                 int i;
740                 unsigned int old_data = leaf->items[slot].offset +
741                                         leaf->items[slot].size;
742
743                 /*
744                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
745                  */
746                 /* first correct the data pointers */
747                 for (i = slot; i < nritems; i++)
748                         leaf->items[i].offset -= data_size;
749
750                 /* shift the items */
751                 memmove(leaf->items + slot + 1, leaf->items + slot,
752                         (nritems - slot) * sizeof(struct item));
753
754                 /* shift the data */
755                 memmove(leaf->data + data_end - data_size, leaf->data +
756                         data_end, old_data - data_end);
757                 data_end = old_data;
758         }
759         /* copy the new data in */
760         memcpy(&leaf->items[slot].key, key, sizeof(struct key));
761         leaf->items[slot].offset = data_end - data_size;
762         leaf->items[slot].size = data_size;
763         memcpy(leaf->data + data_end - data_size, data, data_size);
764         leaf->header.nritems += 1;
765         write_tree_block(root, leaf_buf);
766         if (leaf_free_space(leaf) < 0)
767                 BUG();
768         release_path(root, &path);
769         return 0;
770 }
771
772 /*
773  * delete the pointer from a given level in the path.  The path is not
774  * fixed up, so after calling this it is not valid at that level.
775  *
776  * If the delete empties a node, the node is removed from the tree,
777  * continuing all the way the root if required.  The root is converted into
778  * a leaf if all the nodes are emptied.
779  */
780 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
781 {
782         int slot;
783         struct tree_buffer *t;
784         struct node *node;
785         int nritems;
786
787         while(1) {
788                 t = path->nodes[level];
789                 if (!t)
790                         break;
791                 node = &t->node;
792                 slot = path->slots[level];
793                 nritems = node->header.nritems;
794
795                 if (slot != nritems -1) {
796                         memmove(node->keys + slot, node->keys + slot + 1,
797                                 sizeof(struct key) * (nritems - slot - 1));
798                         memmove(node->blockptrs + slot,
799                                 node->blockptrs + slot + 1,
800                                 sizeof(u64) * (nritems - slot - 1));
801                 }
802                 node->header.nritems--;
803                 write_tree_block(root, t);
804                 if (node->header.nritems != 0) {
805                         int tslot;
806                         if (slot == 0)
807                                 fixup_low_keys(root, path, node->keys,
808                                                level + 1);
809                         tslot = path->slots[level+1];
810                         t->count++;
811                         push_node_left(root, path, level);
812                         if (node->header.nritems) {
813                                 push_node_right(root, path, level);
814                         }
815                         if (node->header.nritems) {
816                                 tree_block_release(root, t);
817                                 break;
818                         }
819                         tree_block_release(root, t);
820                         path->slots[level+1] = tslot;
821                 }
822                 if (t == root->node) {
823                         /* just turn the root into a leaf and break */
824                         root->node->node.header.flags = node_level(0);
825                         write_tree_block(root, t);
826                         break;
827                 }
828                 level++;
829                 if (!path->nodes[level])
830                         BUG();
831         }
832         return 0;
833 }
834
835 /*
836  * delete the item at the leaf level in path.  If that empties
837  * the leaf, remove it from the tree
838  */
839 int del_item(struct ctree_root *root, struct ctree_path *path)
840 {
841         int slot;
842         struct leaf *leaf;
843         struct tree_buffer *leaf_buf;
844         int doff;
845         int dsize;
846
847         leaf_buf = path->nodes[0];
848         leaf = &leaf_buf->leaf;
849         slot = path->slots[0];
850         doff = leaf->items[slot].offset;
851         dsize = leaf->items[slot].size;
852
853         if (slot != leaf->header.nritems - 1) {
854                 int i;
855                 int data_end = leaf_data_end(leaf);
856                 memmove(leaf->data + data_end + dsize,
857                         leaf->data + data_end,
858                         doff - data_end);
859                 for (i = slot + 1; i < leaf->header.nritems; i++)
860                         leaf->items[i].offset += dsize;
861                 memmove(leaf->items + slot, leaf->items + slot + 1,
862                         sizeof(struct item) *
863                         (leaf->header.nritems - slot - 1));
864         }
865         leaf->header.nritems -= 1;
866         /* delete the leaf if we've emptied it */
867         if (leaf->header.nritems == 0) {
868                 if (leaf_buf == root->node) {
869                         leaf->header.flags = node_level(0);
870                         write_tree_block(root, leaf_buf);
871                 } else
872                         del_ptr(root, path, 1);
873         } else {
874                 if (slot == 0)
875                         fixup_low_keys(root, path, &leaf->items[0].key, 1);
876                 write_tree_block(root, leaf_buf);
877                 /* delete the leaf if it is mostly empty */
878                 if (leaf_space_used(leaf, 0, leaf->header.nritems) <
879                     LEAF_DATA_SIZE / 4) {
880                         /* push_leaf_left fixes the path.
881                          * make sure the path still points to our leaf
882                          * for possible call to del_ptr below
883                          */
884                         slot = path->slots[1];
885                         leaf_buf->count++;
886                         push_leaf_left(root, path, 1);
887                         if (leaf->header.nritems == 0) {
888                                 path->slots[1] = slot;
889                                 del_ptr(root, path, 1);
890                         }
891                         tree_block_release(root, leaf_buf);
892                 }
893         }
894         return 0;
895 }
896
897 int next_leaf(struct ctree_root *root, struct ctree_path *path)
898 {
899         int slot;
900         int level = 1;
901         u64 blocknr;
902         struct tree_buffer *c;
903         struct tree_buffer *next = NULL;
904
905         while(level < MAX_LEVEL) {
906                 if (!path->nodes[level])
907                         return -1;
908                 slot = path->slots[level] + 1;
909                 c = path->nodes[level];
910                 if (slot >= c->node.header.nritems) {
911                         level++;
912                         continue;
913                 }
914                 blocknr = c->node.blockptrs[slot];
915                 if (next)
916                         tree_block_release(root, next);
917                 next = read_tree_block(root, blocknr);
918                 break;
919         }
920         path->slots[level] = slot;
921         while(1) {
922                 level--;
923                 c = path->nodes[level];
924                 tree_block_release(root, c);
925                 path->nodes[level] = next;
926                 path->slots[level] = 0;
927                 if (!level)
928                         break;
929                 next = read_tree_block(root, next->node.blockptrs[0]);
930         }
931         return 0;
932 }
933
934 int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
935                  u64 search_end, u64 owner, struct key *ins)
936 {
937         struct ctree_path path;
938         struct key *key;
939         int ret;
940         u64 hole_size = 0;
941         int slot = 0;
942         u64 last_block;
943         int start_found = 0;
944         struct leaf *l;
945         struct extent_item extent_item;
946         struct ctree_root * root = orig_root->extent_root;
947
948         init_path(&path);
949         ins->objectid = search_start;
950         ins->offset = 0;
951         ins->flags = 0;
952
953         ret = search_slot(root, ins, &path);
954         while (1) {
955                 l = &path.nodes[0]->leaf;
956                 slot = path.slots[0];
957                 if (!l) {
958                         // FIXME allocate root
959                 }
960                 if (slot >= l->header.nritems) {
961                         ret = next_leaf(root, &path);
962                         if (ret == 0)
963                                 continue;
964                         if (!start_found) {
965                                 ins->objectid = search_start;
966                                 ins->offset = num_blocks;
967                                 hole_size = search_end - search_start;
968                                 goto insert;
969                         }
970                         ins->objectid = last_block;
971                         ins->offset = num_blocks;
972                         hole_size = search_end - last_block;
973                         goto insert;
974                 }
975                 key = &l->items[slot].key;
976                 if (start_found) {
977                         hole_size = key->objectid - last_block;
978                         if (hole_size > num_blocks) {
979                                 ins->objectid = last_block;
980                                 ins->offset = num_blocks;
981                                 goto insert;
982                         }
983                 } else
984                         start_found = 1;
985                 last_block = key->objectid + key->offset;
986                 path.slots[0]++;
987         }
988         // FIXME -ENOSPC
989 insert:
990         release_path(root, &path);
991         extent_item.refs = 1;
992         extent_item.owner = owner;
993         if (root == orig_root && root->reserve_extent->num_blocks == 0) {
994                 root->reserve_extent->blocknr = ins->objectid;
995                 root->reserve_extent->num_blocks = ins->offset;
996                 root->reserve_extent->num_used = 0;
997         }
998         ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
999         return ret;
1000 }
1001
1002 static int refill_alloc_extent(struct ctree_root *root)
1003 {
1004         struct alloc_extent *ae = root->alloc_extent;
1005         struct key key;
1006         int ret;
1007         int min_blocks = MAX_LEVEL * 2;
1008
1009         if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used >
1010             min_blocks)
1011                 return 0;
1012         ae = root->reserve_extent;
1013         if (ae->num_blocks > ae->num_used) {
1014                 if (root->alloc_extent->num_blocks == 0) {
1015                         /* we should swap reserve/alloc_extent when alloc
1016                          * fills up
1017                          */
1018                         BUG();
1019                 }
1020                 if (ae->num_blocks - ae->num_used < min_blocks)
1021                         BUG();
1022                 return 0;
1023         }
1024         ret = alloc_extent(root,
1025                            min_blocks * 2, 0, (unsigned long)-1,
1026                            root->node->node.header.parentid, &key);
1027         ae->blocknr = key.objectid;
1028         ae->num_blocks = key.offset;
1029         ae->num_used = 0;
1030         return ret;
1031 }
1032
1033 void print_leaf(struct leaf *l)
1034 {
1035         int i;
1036         int nr = l->header.nritems;
1037         struct item *item;
1038         struct extent_item *ei;
1039         printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
1040                leaf_free_space(l));
1041         fflush(stdout);
1042         for (i = 0 ; i < nr ; i++) {
1043                 item = l->items + i;
1044                 printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
1045                         i,
1046                         item->key.objectid, item->key.flags, item->key.offset,
1047                         item->offset, item->size);
1048                 fflush(stdout);
1049                 printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
1050                 ei = (struct extent_item *)(l->data + item->offset);
1051                 printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
1052                 fflush(stdout);
1053         }
1054 }
1055 void print_tree(struct ctree_root *root, struct tree_buffer *t)
1056 {
1057         int i;
1058         int nr;
1059         struct node *c;
1060
1061         if (!t)
1062                 return;
1063         c = &t->node;
1064         nr = c->header.nritems;
1065         if (c->header.blocknr != t->blocknr)
1066                 BUG();
1067         if (is_leaf(c->header.flags)) {
1068                 print_leaf((struct leaf *)c);
1069                 return;
1070         }
1071         printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
1072                 node_level(c->header.flags), c->header.nritems,
1073                 NODEPTRS_PER_BLOCK - c->header.nritems);
1074         fflush(stdout);
1075         for (i = 0; i < nr; i++) {
1076                 printf("\tkey %d (%lu %u %lu) block %lu\n",
1077                        i,
1078                        c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
1079                        c->blockptrs[i]);
1080                 fflush(stdout);
1081         }
1082         for (i = 0; i < nr; i++) {
1083                 struct tree_buffer *next_buf = read_tree_block(root,
1084                                                             c->blockptrs[i]);
1085                 struct node *next = &next_buf->node;
1086                 if (is_leaf(next->header.flags) &&
1087                     node_level(c->header.flags) != 1)
1088                         BUG();
1089                 if (node_level(next->header.flags) !=
1090                         node_level(c->header.flags) - 1)
1091                         BUG();
1092                 print_tree(root, next_buf);
1093                 tree_block_release(root, next_buf);
1094         }
1095
1096 }
1097
1098 /* for testing only */
1099 int next_key(int i, int max_key) {
1100         return rand() % max_key;
1101         // return i;
1102 }
1103
1104 int main() {
1105         struct ctree_root *root;
1106         struct key ins;
1107         struct key last = { (u64)-1, 0, 0};
1108         char *buf;
1109         int i;
1110         int num;
1111         int ret;
1112         int run_size = 10000;
1113         int max_key = 100000000;
1114         int tree_size = 0;
1115         struct ctree_path path;
1116         struct ctree_super_block super;
1117
1118         radix_tree_init();
1119
1120
1121         root = open_ctree("dbfile", &super);
1122         printf("root tree\n");
1123         print_tree(root, root->node);
1124         printf("map tree\n");
1125         print_tree(root->extent_root, root->extent_root->node);
1126
1127         srand(55);
1128         for (i = 0; i < run_size; i++) {
1129                 buf = malloc(64);
1130                 num = next_key(i, max_key);
1131                 // num = i;
1132                 sprintf(buf, "string-%d", num);
1133                 // printf("insert %d\n", num);
1134                 ins.objectid = num;
1135                 ins.offset = 0;
1136                 ins.flags = 0;
1137                 ret = insert_item(root, &ins, buf, strlen(buf));
1138                 if (!ret)
1139                         tree_size++;
1140         }
1141         printf("root used: %lu\n", root->alloc_extent->num_used);
1142         printf("root tree\n");
1143         // print_tree(root, root->node);
1144         printf("map tree\n");
1145         printf("map used: %lu\n", root->extent_root->alloc_extent->num_used);
1146         // print_tree(root->extent_root, root->extent_root->node);
1147         write_ctree_super(root, &super);
1148         close_ctree(root);
1149
1150         root = open_ctree("dbfile", &super);
1151         printf("starting search\n");
1152         srand(55);
1153         for (i = 0; i < run_size; i++) {
1154                 num = next_key(i, max_key);
1155                 ins.objectid = num;
1156                 init_path(&path);
1157                 ret = search_slot(root, &ins, &path);
1158                 if (ret) {
1159                         print_tree(root, root->node);
1160                         printf("unable to find %d\n", num);
1161                         exit(1);
1162                 }
1163                 release_path(root, &path);
1164         }
1165         write_ctree_super(root, &super);
1166         close_ctree(root);
1167         root = open_ctree("dbfile", &super);
1168         printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
1169                 node_level(root->node->node.header.flags),
1170                 root->node->node.header.nritems,
1171                 NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
1172         printf("all searches good, deleting some items\n");
1173         i = 0;
1174         srand(55);
1175         for (i = 0 ; i < run_size/4; i++) {
1176                 num = next_key(i, max_key);
1177                 ins.objectid = num;
1178                 init_path(&path);
1179                 ret = search_slot(root, &ins, &path);
1180                 if (ret)
1181                         continue;
1182                 ret = del_item(root, &path);
1183                 if (ret != 0)
1184                         BUG();
1185                 release_path(root, &path);
1186                 tree_size--;
1187         }
1188         srand(128);
1189         for (i = 0; i < run_size; i++) {
1190                 buf = malloc(64);
1191                 num = next_key(i, max_key);
1192                 sprintf(buf, "string-%d", num);
1193                 ins.objectid = num;
1194                 ret = insert_item(root, &ins, buf, strlen(buf));
1195                 if (!ret)
1196                         tree_size++;
1197         }
1198         write_ctree_super(root, &super);
1199         close_ctree(root);
1200         root = open_ctree("dbfile", &super);
1201         printf("starting search2\n");
1202         srand(128);
1203         for (i = 0; i < run_size; i++) {
1204                 num = next_key(i, max_key);
1205                 ins.objectid = num;
1206                 init_path(&path);
1207                 ret = search_slot(root, &ins, &path);
1208                 if (ret) {
1209                         print_tree(root, root->node);
1210                         printf("unable to find %d\n", num);
1211                         exit(1);
1212                 }
1213                 release_path(root, &path);
1214         }
1215         printf("starting big long delete run\n");
1216         while(root->node && root->node->node.header.nritems > 0) {
1217                 struct leaf *leaf;
1218                 int slot;
1219                 ins.objectid = (u64)-1;
1220                 init_path(&path);
1221                 ret = search_slot(root, &ins, &path);
1222                 if (ret == 0)
1223                         BUG();
1224
1225                 leaf = &path.nodes[0]->leaf;
1226                 slot = path.slots[0];
1227                 if (slot != leaf->header.nritems)
1228                         BUG();
1229                 while(path.slots[0] > 0) {
1230                         path.slots[0] -= 1;
1231                         slot = path.slots[0];
1232                         leaf = &path.nodes[0]->leaf;
1233
1234                         if (comp_keys(&last, &leaf->items[slot].key) <= 0)
1235                                 BUG();
1236                         memcpy(&last, &leaf->items[slot].key, sizeof(last));
1237                         ret = del_item(root, &path);
1238                         if (ret != 0) {
1239                                 printf("del_item returned %d\n", ret);
1240                                 BUG();
1241                         }
1242                         tree_size--;
1243                 }
1244                 release_path(root, &path);
1245         }
1246         write_ctree_super(root, &super);
1247         close_ctree(root);
1248         printf("tree size is now %d\n", tree_size);
1249         return 0;
1250 }