Block sized tree extents and extent deletion
[platform/upstream/btrfs-progs.git] / ctree.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include "kerncompat.h"
4 #include "radix-tree.h"
5 #include "ctree.h"
6 #include "disk-io.h"
7
8 #define SEARCH_READ 0
9 #define SEARCH_WRITE 1
10
11 #define CTREE_EXTENT_PENDING 0
12
13 int split_node(struct ctree_root *root, struct ctree_path *path, int level);
14 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
15 struct tree_buffer *alloc_free_block(struct ctree_root *root);
16 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
17
18 static inline void init_path(struct ctree_path *p)
19 {
20         memset(p, 0, sizeof(*p));
21 }
22
23 static void release_path(struct ctree_root *root, struct ctree_path *p)
24 {
25         int i;
26         for (i = 0; i < MAX_LEVEL; i++) {
27                 if (!p->nodes[i])
28                         break;
29                 tree_block_release(root, p->nodes[i]);
30         }
31 }
32
33 /*
34  * The leaf data grows from end-to-front in the node.
35  * this returns the address of the start of the last item,
36  * which is the stop of the leaf data stack
37  */
38 static inline unsigned int leaf_data_end(struct leaf *leaf)
39 {
40         unsigned int nr = leaf->header.nritems;
41         if (nr == 0)
42                 return sizeof(leaf->data);
43         return leaf->items[nr-1].offset;
44 }
45
46 /*
47  * The space between the end of the leaf items and
48  * the start of the leaf data.  IOW, how much room
49  * the leaf has left for both items and data
50  */
51 static inline int leaf_free_space(struct leaf *leaf)
52 {
53         int data_end = leaf_data_end(leaf);
54         int nritems = leaf->header.nritems;
55         char *items_end = (char *)(leaf->items + nritems + 1);
56         return (char *)(leaf->data + data_end) - (char *)items_end;
57 }
58
59 /*
60  * compare two keys in a memcmp fashion
61  */
62 int comp_keys(struct key *k1, struct key *k2)
63 {
64         if (k1->objectid > k2->objectid)
65                 return 1;
66         if (k1->objectid < k2->objectid)
67                 return -1;
68         if (k1->flags > k2->flags)
69                 return 1;
70         if (k1->flags < k2->flags)
71                 return -1;
72         if (k1->offset > k2->offset)
73                 return 1;
74         if (k1->offset < k2->offset)
75                 return -1;
76         return 0;
77 }
78
79 /*
80  * search for key in the array p.  items p are item_size apart
81  * and there are 'max' items in p
82  * the slot in the array is returned via slot, and it points to
83  * the place where you would insert key if it is not found in
84  * the array.
85  *
86  * slot may point to max if the key is bigger than all of the keys
87  */
88 int generic_bin_search(char *p, int item_size, struct key *key,
89                        int max, int *slot)
90 {
91         int low = 0;
92         int high = max;
93         int mid;
94         int ret;
95         struct key *tmp;
96
97         while(low < high) {
98                 mid = (low + high) / 2;
99                 tmp = (struct key *)(p + mid * item_size);
100                 ret = comp_keys(tmp, key);
101
102                 if (ret < 0)
103                         low = mid + 1;
104                 else if (ret > 0)
105                         high = mid;
106                 else {
107                         *slot = mid;
108                         return 0;
109                 }
110         }
111         *slot = low;
112         return 1;
113 }
114
115 int bin_search(struct node *c, struct key *key, int *slot)
116 {
117         if (is_leaf(c->header.flags)) {
118                 struct leaf *l = (struct leaf *)c;
119                 return generic_bin_search((void *)l->items, sizeof(struct item),
120                                           key, c->header.nritems, slot);
121         } else {
122                 return generic_bin_search((void *)c->keys, sizeof(struct key),
123                                           key, c->header.nritems, slot);
124         }
125         return -1;
126 }
127
128 /*
129  * look for key in the tree.  path is filled in with nodes along the way
130  * if key is found, we return zero and you can find the item in the leaf
131  * level of the path (level 0)
132  *
133  * If the key isn't found, the path points to the slot where it should
134  * be inserted.
135  */
136 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len)
137 {
138         struct tree_buffer *b = root->node;
139         struct node *c;
140         int slot;
141         int ret;
142         int level;
143
144         b->count++;
145         while (b) {
146                 c = &b->node;
147                 level = node_level(c->header.flags);
148                 p->nodes[level] = b;
149                 ret = bin_search(c, key, &slot);
150                 if (!is_leaf(c->header.flags)) {
151                         if (ret && slot > 0)
152                                 slot -= 1;
153                         p->slots[level] = slot;
154                         if (ins_len && c->header.nritems == NODEPTRS_PER_BLOCK) {
155                                 int sret = split_node(root, p, level);
156                                 BUG_ON(sret > 0);
157                                 if (sret)
158                                         return sret;
159                                 b = p->nodes[level];
160                                 c = &b->node;
161                                 slot = p->slots[level];
162                         }
163                         b = read_tree_block(root, c->blockptrs[slot]);
164                         continue;
165                 } else {
166                         struct leaf *l = (struct leaf *)c;
167                         p->slots[level] = slot;
168                         if (ins_len && leaf_free_space(l) <  sizeof(struct item) + ins_len) {
169                                 int sret = split_leaf(root, p, ins_len);
170                                 BUG_ON(sret > 0);
171                                 if (sret)
172                                         return sret;
173                         }
174                         return ret;
175                 }
176         }
177         return -1;
178 }
179
180 /*
181  * adjust the pointers going up the tree, starting at level
182  * making sure the right key of each node is points to 'key'.
183  * This is used after shifting pointers to the left, so it stops
184  * fixing up pointers when a given leaf/node is not in slot 0 of the
185  * higher levels
186  */
187 static void fixup_low_keys(struct ctree_root *root,
188                            struct ctree_path *path, struct key *key,
189                            int level)
190 {
191         int i;
192         for (i = level; i < MAX_LEVEL; i++) {
193                 struct node *t;
194                 int tslot = path->slots[i];
195                 if (!path->nodes[i])
196                         break;
197                 t = &path->nodes[i]->node;
198                 memcpy(t->keys + tslot, key, sizeof(*key));
199                 write_tree_block(root, path->nodes[i]);
200                 if (tslot != 0)
201                         break;
202         }
203 }
204
205 /*
206  * try to push data from one node into the next node left in the
207  * tree.  The src node is found at specified level in the path.
208  * If some bytes were pushed, return 0, otherwise return 1.
209  *
210  * Lower nodes/leaves in the path are not touched, higher nodes may
211  * be modified to reflect the push.
212  *
213  * The path is altered to reflect the push.
214  */
215 int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
216 {
217         int slot;
218         struct node *left;
219         struct node *right;
220         int push_items = 0;
221         int left_nritems;
222         int right_nritems;
223         struct tree_buffer *t;
224         struct tree_buffer *right_buf;
225
226         if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
227                 return 1;
228         slot = path->slots[level + 1];
229         if (slot == 0)
230                 return 1;
231
232         t = read_tree_block(root,
233                             path->nodes[level + 1]->node.blockptrs[slot - 1]);
234         left = &t->node;
235         right_buf = path->nodes[level];
236         right = &right_buf->node;
237         left_nritems = left->header.nritems;
238         right_nritems = right->header.nritems;
239         push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
240         if (push_items <= 0) {
241                 tree_block_release(root, t);
242                 return 1;
243         }
244
245         if (right_nritems < push_items)
246                 push_items = right_nritems;
247         memcpy(left->keys + left_nritems, right->keys,
248                 push_items * sizeof(struct key));
249         memcpy(left->blockptrs + left_nritems, right->blockptrs,
250                 push_items * sizeof(u64));
251         memmove(right->keys, right->keys + push_items,
252                 (right_nritems - push_items) * sizeof(struct key));
253         memmove(right->blockptrs, right->blockptrs + push_items,
254                 (right_nritems - push_items) * sizeof(u64));
255         right->header.nritems -= push_items;
256         left->header.nritems += push_items;
257
258         /* adjust the pointers going up the tree */
259         fixup_low_keys(root, path, right->keys, level + 1);
260
261         write_tree_block(root, t);
262         write_tree_block(root, right_buf);
263
264         /* then fixup the leaf pointer in the path */
265         if (path->slots[level] < push_items) {
266                 path->slots[level] += left_nritems;
267                 tree_block_release(root, path->nodes[level]);
268                 path->nodes[level] = t;
269                 path->slots[level + 1] -= 1;
270         } else {
271                 path->slots[level] -= push_items;
272                 tree_block_release(root, t);
273         }
274         return 0;
275 }
276
277 /*
278  * try to push data from one node into the next node right in the
279  * tree.  The src node is found at specified level in the path.
280  * If some bytes were pushed, return 0, otherwise return 1.
281  *
282  * Lower nodes/leaves in the path are not touched, higher nodes may
283  * be modified to reflect the push.
284  *
285  * The path is altered to reflect the push.
286  */
287 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
288 {
289         int slot;
290         struct tree_buffer *t;
291         struct tree_buffer *src_buffer;
292         struct node *dst;
293         struct node *src;
294         int push_items = 0;
295         int dst_nritems;
296         int src_nritems;
297
298         /* can't push from the root */
299         if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
300                 return 1;
301
302         /* only try to push inside the node higher up */
303         slot = path->slots[level + 1];
304         if (slot == NODEPTRS_PER_BLOCK - 1)
305                 return 1;
306
307         if (slot >= path->nodes[level + 1]->node.header.nritems -1)
308                 return 1;
309
310         t = read_tree_block(root,
311                             path->nodes[level + 1]->node.blockptrs[slot + 1]);
312         dst = &t->node;
313         src_buffer = path->nodes[level];
314         src = &src_buffer->node;
315         dst_nritems = dst->header.nritems;
316         src_nritems = src->header.nritems;
317         push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
318         if (push_items <= 0) {
319                 tree_block_release(root, t);
320                 return 1;
321         }
322
323         if (src_nritems < push_items)
324                 push_items = src_nritems;
325         memmove(dst->keys + push_items, dst->keys,
326                 dst_nritems * sizeof(struct key));
327         memcpy(dst->keys, src->keys + src_nritems - push_items,
328                 push_items * sizeof(struct key));
329
330         memmove(dst->blockptrs + push_items, dst->blockptrs,
331                 dst_nritems * sizeof(u64));
332         memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
333                 push_items * sizeof(u64));
334
335         src->header.nritems -= push_items;
336         dst->header.nritems += push_items;
337
338         /* adjust the pointers going up the tree */
339         memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
340                 dst->keys, sizeof(struct key));
341
342         write_tree_block(root, path->nodes[level + 1]);
343         write_tree_block(root, t);
344         write_tree_block(root, src_buffer);
345
346         /* then fixup the pointers in the path */
347         if (path->slots[level] >= src->header.nritems) {
348                 path->slots[level] -= src->header.nritems;
349                 tree_block_release(root, path->nodes[level]);
350                 path->nodes[level] = t;
351                 path->slots[level + 1] += 1;
352         } else {
353                 tree_block_release(root, t);
354         }
355         return 0;
356 }
357
358 static int insert_new_root(struct ctree_root *root, struct ctree_path *path, int level)
359 {
360         struct tree_buffer *t;
361         struct node *lower;
362         struct node *c;
363         struct key *lower_key;
364
365         BUG_ON(path->nodes[level]);
366         BUG_ON(path->nodes[level-1] != root->node);
367
368         t = alloc_free_block(root);
369         c = &t->node;
370         memset(c, 0, sizeof(c));
371         c->header.nritems = 1;
372         c->header.flags = node_level(level);
373         c->header.blocknr = t->blocknr;
374         c->header.parentid = root->node->node.header.parentid;
375         lower = &path->nodes[level-1]->node;
376         if (is_leaf(lower->header.flags))
377                 lower_key = &((struct leaf *)lower)->items[0].key;
378         else
379                 lower_key = lower->keys;
380         memcpy(c->keys, lower_key, sizeof(struct key));
381         c->blockptrs[0] = path->nodes[level-1]->blocknr;
382         /* the super has an extra ref to root->node */
383         tree_block_release(root, root->node);
384         root->node = t;
385         t->count++;
386         write_tree_block(root, t);
387         path->nodes[level] = t;
388         path->slots[level] = 0;
389         return 0;
390 }
391
392 /*
393  * worker function to insert a single pointer in a node.
394  * the node should have enough room for the pointer already
395  * slot and level indicate where you want the key to go, and
396  * blocknr is the block the key points to.
397  */
398 int insert_ptr(struct ctree_root *root,
399                 struct ctree_path *path, struct key *key,
400                 u64 blocknr, int slot, int level)
401 {
402         struct node *lower;
403         int nritems;
404
405         BUG_ON(!path->nodes[level]);
406         lower = &path->nodes[level]->node;
407         nritems = lower->header.nritems;
408         if (slot > nritems)
409                 BUG();
410         if (nritems == NODEPTRS_PER_BLOCK)
411                 BUG();
412         if (slot != nritems) {
413                 memmove(lower->keys + slot + 1, lower->keys + slot,
414                         (nritems - slot) * sizeof(struct key));
415                 memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
416                         (nritems - slot) * sizeof(u64));
417         }
418         memcpy(lower->keys + slot, key, sizeof(struct key));
419         lower->blockptrs[slot] = blocknr;
420         lower->header.nritems++;
421         if (lower->keys[1].objectid == 0)
422                         BUG();
423         write_tree_block(root, path->nodes[level]);
424         return 0;
425 }
426
427 int split_node(struct ctree_root *root, struct ctree_path *path, int level)
428 {
429         struct tree_buffer *t;
430         struct node *c;
431         struct tree_buffer *split_buffer;
432         struct node *split;
433         int mid;
434         int ret;
435
436         ret = push_node_left(root, path, level);
437         if (!ret)
438                 return 0;
439         ret = push_node_right(root, path, level);
440         if (!ret)
441                 return 0;
442         t = path->nodes[level];
443         c = &t->node;
444         if (t == root->node) {
445                 /* trying to split the root, lets make a new one */
446                 ret = insert_new_root(root, path, level + 1);
447                 if (ret)
448                         return ret;
449         }
450         split_buffer = alloc_free_block(root);
451         split = &split_buffer->node;
452         split->header.flags = c->header.flags;
453         split->header.blocknr = split_buffer->blocknr;
454         split->header.parentid = root->node->node.header.parentid;
455         mid = (c->header.nritems + 1) / 2;
456         memcpy(split->keys, c->keys + mid,
457                 (c->header.nritems - mid) * sizeof(struct key));
458         memcpy(split->blockptrs, c->blockptrs + mid,
459                 (c->header.nritems - mid) * sizeof(u64));
460         split->header.nritems = c->header.nritems - mid;
461         c->header.nritems = mid;
462         write_tree_block(root, t);
463         write_tree_block(root, split_buffer);
464         insert_ptr(root, path, split->keys, split_buffer->blocknr,
465                      path->slots[level + 1] + 1, level + 1);
466         if (path->slots[level] > mid) {
467                 path->slots[level] -= mid;
468                 tree_block_release(root, t);
469                 path->nodes[level] = split_buffer;
470                 path->slots[level + 1] += 1;
471         } else {
472                 tree_block_release(root, split_buffer);
473         }
474         return 0;
475 }
476
477 /*
478  * how many bytes are required to store the items in a leaf.  start
479  * and nr indicate which items in the leaf to check.  This totals up the
480  * space used both by the item structs and the item data
481  */
482 int leaf_space_used(struct leaf *l, int start, int nr)
483 {
484         int data_len;
485         int end = start + nr - 1;
486
487         if (!nr)
488                 return 0;
489         data_len = l->items[start].offset + l->items[start].size;
490         data_len = data_len - l->items[end].offset;
491         data_len += sizeof(struct item) * nr;
492         return data_len;
493 }
494
495 /*
496  * push some data in the path leaf to the left, trying to free up at
497  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
498  */
499 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
500                    int data_size)
501 {
502         struct tree_buffer *right_buf = path->nodes[0];
503         struct leaf *right = &right_buf->leaf;
504         struct tree_buffer *t;
505         struct leaf *left;
506         int slot;
507         int i;
508         int free_space;
509         int push_space = 0;
510         int push_items = 0;
511         struct item *item;
512         int old_left_nritems;
513
514         slot = path->slots[1];
515         if (slot == 0) {
516                 return 1;
517         }
518         if (!path->nodes[1]) {
519                 return 1;
520         }
521         t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
522         left = &t->leaf;
523         free_space = leaf_free_space(left);
524         if (free_space < data_size + sizeof(struct item)) {
525                 tree_block_release(root, t);
526                 return 1;
527         }
528         for (i = 0; i < right->header.nritems; i++) {
529                 item = right->items + i;
530                 if (path->slots[0] == i)
531                         push_space += data_size + sizeof(*item);
532                 if (item->size + sizeof(*item) + push_space > free_space)
533                         break;
534                 push_items++;
535                 push_space += item->size + sizeof(*item);
536         }
537         if (push_items == 0) {
538                 tree_block_release(root, t);
539                 return 1;
540         }
541         /* push data from right to left */
542         memcpy(left->items + left->header.nritems,
543                 right->items, push_items * sizeof(struct item));
544         push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
545         memcpy(left->data + leaf_data_end(left) - push_space,
546                 right->data + right->items[push_items - 1].offset,
547                 push_space);
548         old_left_nritems = left->header.nritems;
549         BUG_ON(old_left_nritems < 0);
550
551         for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
552                 left->items[i].offset -= LEAF_DATA_SIZE -
553                         left->items[old_left_nritems -1].offset;
554         }
555         left->header.nritems += push_items;
556
557         /* fixup right node */
558         push_space = right->items[push_items-1].offset - leaf_data_end(right);
559         memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
560                 leaf_data_end(right), push_space);
561         memmove(right->items, right->items + push_items,
562                 (right->header.nritems - push_items) * sizeof(struct item));
563         right->header.nritems -= push_items;
564         push_space = LEAF_DATA_SIZE;
565
566         for (i = 0; i < right->header.nritems; i++) {
567                 right->items[i].offset = push_space - right->items[i].size;
568                 push_space = right->items[i].offset;
569         }
570
571         write_tree_block(root, t);
572         write_tree_block(root, right_buf);
573
574         fixup_low_keys(root, path, &right->items[0].key, 1);
575
576         /* then fixup the leaf pointer in the path */
577         if (path->slots[0] < push_items) {
578                 path->slots[0] += old_left_nritems;
579                 tree_block_release(root, path->nodes[0]);
580                 path->nodes[0] = t;
581                 path->slots[1] -= 1;
582         } else {
583                 tree_block_release(root, t);
584                 path->slots[0] -= push_items;
585         }
586         BUG_ON(path->slots[0] < 0);
587         return 0;
588 }
589
590 /*
591  * split the path's leaf in two, making sure there is at least data_size
592  * available for the resulting leaf level of the path.
593  */
594 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
595 {
596         struct tree_buffer *l_buf = path->nodes[0];
597         struct leaf *l = &l_buf->leaf;
598         int nritems;
599         int mid;
600         int slot;
601         struct leaf *right;
602         struct tree_buffer *right_buffer;
603         int space_needed = data_size + sizeof(struct item);
604         int data_copy_size;
605         int rt_data_off;
606         int i;
607         int ret;
608
609         if (push_leaf_left(root, path, data_size) == 0) {
610                 l_buf = path->nodes[0];
611                 l = &l_buf->leaf;
612                 if (leaf_free_space(l) >= sizeof(struct item) + data_size)
613                         return 0;
614         }
615         if (!path->nodes[1]) {
616                 ret = insert_new_root(root, path, 1);
617                 if (ret)
618                         return ret;
619         }
620         slot = path->slots[0];
621         nritems = l->header.nritems;
622         mid = (nritems + 1)/ 2;
623
624         right_buffer = alloc_free_block(root);
625         BUG_ON(!right_buffer);
626         BUG_ON(mid == nritems);
627         right = &right_buffer->leaf;
628         memset(right, 0, sizeof(*right));
629         if (mid <= slot) {
630                 if (leaf_space_used(l, mid, nritems - mid) + space_needed >
631                         LEAF_DATA_SIZE)
632                         BUG();
633         } else {
634                 if (leaf_space_used(l, 0, mid + 1) + space_needed >
635                         LEAF_DATA_SIZE)
636                         BUG();
637         }
638         right->header.nritems = nritems - mid;
639         right->header.blocknr = right_buffer->blocknr;
640         right->header.flags = node_level(0);
641         right->header.parentid = root->node->node.header.parentid;
642         data_copy_size = l->items[mid].offset + l->items[mid].size -
643                          leaf_data_end(l);
644         memcpy(right->items, l->items + mid,
645                (nritems - mid) * sizeof(struct item));
646         memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
647                l->data + leaf_data_end(l), data_copy_size);
648         rt_data_off = LEAF_DATA_SIZE -
649                      (l->items[mid].offset + l->items[mid].size);
650
651         for (i = 0; i < right->header.nritems; i++)
652                 right->items[i].offset += rt_data_off;
653
654         l->header.nritems = mid;
655         ret = insert_ptr(root, path, &right->items[0].key,
656                           right_buffer->blocknr, path->slots[1] + 1, 1);
657         write_tree_block(root, right_buffer);
658         write_tree_block(root, l_buf);
659
660         BUG_ON(path->slots[0] != slot);
661         if (mid <= slot) {
662                 tree_block_release(root, path->nodes[0]);
663                 path->nodes[0] = right_buffer;
664                 path->slots[0] -= mid;
665                 path->slots[1] += 1;
666         } else
667                 tree_block_release(root, right_buffer);
668         BUG_ON(path->slots[0] < 0);
669         return ret;
670 }
671
672 /*
673  * Given a key and some data, insert an item into the tree.
674  * This does all the path init required, making room in the tree if needed.
675  */
676 int insert_item(struct ctree_root *root, struct key *key,
677                           void *data, int data_size)
678 {
679         int ret;
680         int slot;
681         int slot_orig;
682         struct leaf *leaf;
683         struct tree_buffer *leaf_buf;
684         unsigned int nritems;
685         unsigned int data_end;
686         struct ctree_path path;
687
688         /* create a root if there isn't one */
689         if (!root->node)
690                 BUG();
691         init_path(&path);
692         ret = search_slot(root, key, &path, data_size);
693         if (ret == 0) {
694                 release_path(root, &path);
695                 return -EEXIST;
696         }
697
698         slot_orig = path.slots[0];
699         leaf_buf = path.nodes[0];
700         leaf = &leaf_buf->leaf;
701
702         nritems = leaf->header.nritems;
703         data_end = leaf_data_end(leaf);
704
705         if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
706                 BUG();
707
708         slot = path.slots[0];
709         BUG_ON(slot < 0);
710         if (slot == 0)
711                 fixup_low_keys(root, &path, key, 1);
712         if (slot != nritems) {
713                 int i;
714                 unsigned int old_data = leaf->items[slot].offset +
715                                         leaf->items[slot].size;
716
717                 /*
718                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
719                  */
720                 /* first correct the data pointers */
721                 for (i = slot; i < nritems; i++)
722                         leaf->items[i].offset -= data_size;
723
724                 /* shift the items */
725                 memmove(leaf->items + slot + 1, leaf->items + slot,
726                         (nritems - slot) * sizeof(struct item));
727
728                 /* shift the data */
729                 memmove(leaf->data + data_end - data_size, leaf->data +
730                         data_end, old_data - data_end);
731                 data_end = old_data;
732         }
733         /* copy the new data in */
734         memcpy(&leaf->items[slot].key, key, sizeof(struct key));
735         leaf->items[slot].offset = data_end - data_size;
736         leaf->items[slot].size = data_size;
737         memcpy(leaf->data + data_end - data_size, data, data_size);
738         leaf->header.nritems += 1;
739         write_tree_block(root, leaf_buf);
740         if (leaf_free_space(leaf) < 0)
741                 BUG();
742         release_path(root, &path);
743         return 0;
744 }
745
746 /*
747  * delete the pointer from a given level in the path.  The path is not
748  * fixed up, so after calling this it is not valid at that level.
749  *
750  * If the delete empties a node, the node is removed from the tree,
751  * continuing all the way the root if required.  The root is converted into
752  * a leaf if all the nodes are emptied.
753  */
754 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
755 {
756         int slot;
757         struct tree_buffer *t;
758         struct node *node;
759         int nritems;
760         u64 blocknr;
761
762         while(1) {
763                 t = path->nodes[level];
764                 if (!t)
765                         break;
766                 node = &t->node;
767                 slot = path->slots[level];
768                 nritems = node->header.nritems;
769
770                 if (slot != nritems -1) {
771                         memmove(node->keys + slot, node->keys + slot + 1,
772                                 sizeof(struct key) * (nritems - slot - 1));
773                         memmove(node->blockptrs + slot,
774                                 node->blockptrs + slot + 1,
775                                 sizeof(u64) * (nritems - slot - 1));
776                 }
777                 node->header.nritems--;
778                 write_tree_block(root, t);
779                 blocknr = t->blocknr;
780                 if (node->header.nritems != 0) {
781                         int tslot;
782                         if (slot == 0)
783                                 fixup_low_keys(root, path, node->keys,
784                                                level + 1);
785                         tslot = path->slots[level+1];
786                         t->count++;
787                         push_node_left(root, path, level);
788                         if (node->header.nritems) {
789                                 push_node_right(root, path, level);
790                         }
791                         if (node->header.nritems) {
792                                 tree_block_release(root, t);
793                                 break;
794                         }
795                         tree_block_release(root, t);
796                         path->slots[level+1] = tslot;
797                 }
798                 if (t == root->node) {
799                         /* just turn the root into a leaf and break */
800                         root->node->node.header.flags = node_level(0);
801                         write_tree_block(root, t);
802                         break;
803                 }
804                 level++;
805                 free_extent(root, blocknr, 1);
806                 if (!path->nodes[level])
807                         BUG();
808         }
809         return 0;
810 }
811
812 /*
813  * delete the item at the leaf level in path.  If that empties
814  * the leaf, remove it from the tree
815  */
816 int del_item(struct ctree_root *root, struct ctree_path *path)
817 {
818         int slot;
819         struct leaf *leaf;
820         struct tree_buffer *leaf_buf;
821         int doff;
822         int dsize;
823
824         leaf_buf = path->nodes[0];
825         leaf = &leaf_buf->leaf;
826         slot = path->slots[0];
827         doff = leaf->items[slot].offset;
828         dsize = leaf->items[slot].size;
829
830         if (slot != leaf->header.nritems - 1) {
831                 int i;
832                 int data_end = leaf_data_end(leaf);
833                 memmove(leaf->data + data_end + dsize,
834                         leaf->data + data_end,
835                         doff - data_end);
836                 for (i = slot + 1; i < leaf->header.nritems; i++)
837                         leaf->items[i].offset += dsize;
838                 memmove(leaf->items + slot, leaf->items + slot + 1,
839                         sizeof(struct item) *
840                         (leaf->header.nritems - slot - 1));
841         }
842         leaf->header.nritems -= 1;
843         /* delete the leaf if we've emptied it */
844         if (leaf->header.nritems == 0) {
845                 if (leaf_buf == root->node) {
846                         leaf->header.flags = node_level(0);
847                         write_tree_block(root, leaf_buf);
848                 } else {
849                         del_ptr(root, path, 1);
850                         free_extent(root, leaf_buf->blocknr, 1);
851                 }
852         } else {
853                 if (slot == 0)
854                         fixup_low_keys(root, path, &leaf->items[0].key, 1);
855                 write_tree_block(root, leaf_buf);
856                 /* delete the leaf if it is mostly empty */
857                 if (leaf_space_used(leaf, 0, leaf->header.nritems) <
858                     LEAF_DATA_SIZE / 4) {
859                         /* push_leaf_left fixes the path.
860                          * make sure the path still points to our leaf
861                          * for possible call to del_ptr below
862                          */
863                         slot = path->slots[1];
864                         leaf_buf->count++;
865                         push_leaf_left(root, path, 1);
866                         if (leaf->header.nritems == 0) {
867                                 path->slots[1] = slot;
868                                 del_ptr(root, path, 1);
869                         }
870                         tree_block_release(root, leaf_buf);
871                 }
872         }
873         return 0;
874 }
875
876 static int del_pending_extents(struct ctree_root *extent_root)
877 {
878         int ret;
879         struct key key;
880         struct tree_buffer *gang[4];
881         int i;
882         struct ctree_path path;
883
884         while(1) {
885                 ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
886                                                  (void **)gang, 0, ARRAY_SIZE(gang),
887                                                  CTREE_EXTENT_PENDING);
888                 if (!ret)
889                         break;
890                 for (i = 0; i < ret; i++) {
891                         key.objectid = gang[i]->blocknr;
892                         key.flags = 0;
893                         key.offset = 1;
894                         init_path(&path);
895                         ret = search_slot(extent_root, &key, &path, 0);
896                         if (ret) {
897                                 BUG();
898                                 // FIXME undo it and return sane
899                                 return ret;
900                         }
901                         ret = del_item(extent_root, &path);
902                         if (ret) {
903                                 BUG();
904                                 return ret;
905                         }
906                         release_path(extent_root, &path);
907                         radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
908                                                 CTREE_EXTENT_PENDING);
909                         tree_block_release(extent_root, gang[i]);
910                 }
911         }
912         return 0;
913 }
914
915 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
916 {
917         struct ctree_path path;
918         struct key key;
919         struct ctree_root *extent_root = root->extent_root;
920         struct tree_buffer *t;
921         int pending_ret;
922         int ret;
923
924         key.objectid = blocknr;
925         key.flags = 0;
926         key.offset = num_blocks;
927         if (root == extent_root) {
928                 t = read_tree_block(root, key.objectid);
929                 radix_tree_tag_set(&root->cache_radix, key.objectid, CTREE_EXTENT_PENDING);
930                 return 0;
931         }
932         init_path(&path);
933         ret = search_slot(extent_root, &key, &path, 0);
934         if (ret)
935                 BUG();
936         ret = del_item(extent_root, &path);
937         release_path(extent_root, &path);
938         pending_ret = del_pending_extents(root->extent_root);
939         return ret ? ret : pending_ret;
940 }
941
942 int next_leaf(struct ctree_root *root, struct ctree_path *path)
943 {
944         int slot;
945         int level = 1;
946         u64 blocknr;
947         struct tree_buffer *c;
948         struct tree_buffer *next = NULL;
949
950         while(level < MAX_LEVEL) {
951                 if (!path->nodes[level])
952                         return -1;
953                 slot = path->slots[level] + 1;
954                 c = path->nodes[level];
955                 if (slot >= c->node.header.nritems) {
956                         level++;
957                         continue;
958                 }
959                 blocknr = c->node.blockptrs[slot];
960                 if (next)
961                         tree_block_release(root, next);
962                 next = read_tree_block(root, blocknr);
963                 break;
964         }
965         path->slots[level] = slot;
966         while(1) {
967                 level--;
968                 c = path->nodes[level];
969                 tree_block_release(root, c);
970                 path->nodes[level] = next;
971                 path->slots[level] = 0;
972                 if (!level)
973                         break;
974                 next = read_tree_block(root, next->node.blockptrs[0]);
975         }
976         return 0;
977 }
978
979 int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
980                          u64 search_end, struct key *ins)
981 {
982         struct ctree_path path;
983         struct key *key;
984         int ret;
985         u64 hole_size = 0;
986         int slot = 0;
987         u64 last_block;
988         int start_found = 0;
989         struct leaf *l;
990         struct ctree_root * root = orig_root->extent_root;
991
992         init_path(&path);
993         ins->objectid = search_start;
994         ins->offset = 0;
995         ins->flags = 0;
996         ret = search_slot(root, ins, &path, 0);
997         while (1) {
998                 l = &path.nodes[0]->leaf;
999                 slot = path.slots[0];
1000                 if (!l) {
1001                         // FIXME allocate root
1002                 }
1003                 if (slot >= l->header.nritems) {
1004                         ret = next_leaf(root, &path);
1005                         if (ret == 0)
1006                                 continue;
1007                         if (!start_found) {
1008                                 ins->objectid = search_start;
1009                                 ins->offset = num_blocks;
1010                                 hole_size = search_end - search_start;
1011                                 start_found = 1;
1012                                 goto insert;
1013                         }
1014                         ins->objectid = last_block;
1015                         ins->offset = num_blocks;
1016                         hole_size = search_end - last_block;
1017                         goto insert;
1018                 }
1019                 key = &l->items[slot].key;
1020                 if (start_found) {
1021                         hole_size = key->objectid - last_block;
1022                         if (hole_size > num_blocks) {
1023                                 ins->objectid = last_block;
1024                                 ins->offset = num_blocks;
1025                                 goto insert;
1026                         }
1027                 } else
1028                         start_found = 1;
1029                 last_block = key->objectid + key->offset;
1030 insert_failed:
1031                 path.slots[0]++;
1032         }
1033         // FIXME -ENOSPC
1034 insert:
1035         if (orig_root->extent_root == orig_root) {
1036                 BUG_ON(num_blocks != 1);
1037                 if ((root->current_insert.objectid <= ins->objectid &&
1038                     root->current_insert.objectid + root->current_insert.offset >
1039                     ins->objectid) ||
1040                    (root->current_insert.objectid > ins->objectid &&
1041                     root->current_insert.objectid <= ins->objectid + ins->offset) ||
1042                    radix_tree_tag_get(&root->cache_radix, ins->objectid,
1043                                       CTREE_EXTENT_PENDING)) {
1044                         last_block = ins->objectid + 1;
1045                         search_start = last_block;
1046                         goto insert_failed;
1047                 }
1048         }
1049         release_path(root, &path);
1050         if (ins->offset != 1)
1051                 BUG();
1052         return 0;
1053 }
1054
1055 static int insert_pending_extents(struct ctree_root *extent_root)
1056 {
1057         int ret;
1058         struct key key;
1059         struct extent_item item;
1060         struct tree_buffer *gang[4];
1061         int i;
1062
1063         // FIXME -ENOSPC
1064         item.refs = 1;
1065         item.owner = extent_root->node->node.header.parentid;
1066         while(1) {
1067                 ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
1068                                                  (void **)gang, 0, ARRAY_SIZE(gang),
1069                                                  CTREE_EXTENT_PENDING);
1070                 if (!ret)
1071                         break;
1072                 for (i = 0; i < ret; i++) {
1073                         key.objectid = gang[i]->blocknr;
1074                         key.flags = 0;
1075                         key.offset = 1;
1076                         ret = insert_item(extent_root, &key, &item, sizeof(item));
1077                         if (ret) {
1078                                 BUG();
1079                                 // FIXME undo it and return sane
1080                                 return ret;
1081                         }
1082                         radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
1083                                                 CTREE_EXTENT_PENDING);
1084                         tree_block_release(extent_root, gang[i]);
1085                 }
1086         }
1087         return 0;
1088 }
1089
1090 int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
1091                          u64 search_end, u64 owner, struct key *ins, struct tree_buffer **buf)
1092 {
1093         int ret;
1094         int pending_ret;
1095         struct extent_item extent_item;
1096
1097         extent_item.refs = 1;
1098         extent_item.owner = owner;
1099
1100         ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
1101         if (ret)
1102                 return ret;
1103
1104         if (root != root->extent_root) {
1105                 memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));
1106                 ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
1107                 memset(&root->extent_root->current_insert, 0, sizeof(struct key));
1108                 pending_ret = insert_pending_extents(root->extent_root);
1109                 if (ret)
1110                         return ret;
1111                 if (pending_ret)
1112                         return pending_ret;
1113                 *buf = find_tree_block(root, ins->objectid);
1114                 return 0;
1115         }
1116         /* we're allocating an extent for the extent tree, don't recurse */
1117         BUG_ON(ins->offset != 1);
1118         *buf = find_tree_block(root, ins->objectid);
1119         BUG_ON(!*buf);
1120         radix_tree_tag_set(&root->cache_radix, ins->objectid, CTREE_EXTENT_PENDING);
1121         (*buf)->count++;
1122         return 0;
1123
1124 }
1125
1126 struct tree_buffer *alloc_free_block(struct ctree_root *root)
1127 {
1128         struct key ins;
1129         int ret;
1130         struct tree_buffer *buf = NULL;
1131
1132         ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid,
1133                            &ins, &buf);
1134
1135         if (ret) {
1136                 BUG();
1137                 return NULL;
1138         }
1139         if (root != root->extent_root)
1140                 BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, buf->blocknr,
1141                                           CTREE_EXTENT_PENDING));
1142         return buf;
1143 }
1144
1145 void print_leaf(struct leaf *l)
1146 {
1147         int i;
1148         int nr = l->header.nritems;
1149         struct item *item;
1150         struct extent_item *ei;
1151         printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
1152                leaf_free_space(l));
1153         fflush(stdout);
1154         for (i = 0 ; i < nr ; i++) {
1155                 item = l->items + i;
1156                 printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
1157                         i,
1158                         item->key.objectid, item->key.flags, item->key.offset,
1159                         item->offset, item->size);
1160                 fflush(stdout);
1161                 printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
1162                 ei = (struct extent_item *)(l->data + item->offset);
1163                 printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
1164                 fflush(stdout);
1165         }
1166 }
1167 void print_tree(struct ctree_root *root, struct tree_buffer *t)
1168 {
1169         int i;
1170         int nr;
1171         struct node *c;
1172
1173         if (!t)
1174                 return;
1175         c = &t->node;
1176         nr = c->header.nritems;
1177         if (c->header.blocknr != t->blocknr)
1178                 BUG();
1179         if (is_leaf(c->header.flags)) {
1180                 print_leaf((struct leaf *)c);
1181                 return;
1182         }
1183         printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
1184                 node_level(c->header.flags), c->header.nritems,
1185                 NODEPTRS_PER_BLOCK - c->header.nritems);
1186         fflush(stdout);
1187         for (i = 0; i < nr; i++) {
1188                 printf("\tkey %d (%lu %u %lu) block %lu\n",
1189                        i,
1190                        c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
1191                        c->blockptrs[i]);
1192                 fflush(stdout);
1193         }
1194         for (i = 0; i < nr; i++) {
1195                 struct tree_buffer *next_buf = read_tree_block(root,
1196                                                             c->blockptrs[i]);
1197                 struct node *next = &next_buf->node;
1198                 if (is_leaf(next->header.flags) &&
1199                     node_level(c->header.flags) != 1)
1200                         BUG();
1201                 if (node_level(next->header.flags) !=
1202                         node_level(c->header.flags) - 1)
1203                         BUG();
1204                 print_tree(root, next_buf);
1205                 tree_block_release(root, next_buf);
1206         }
1207
1208 }
1209
1210 /* for testing only */
1211 int next_key(int i, int max_key) {
1212         // return rand() % max_key;
1213         return i;
1214 }
1215
1216 int main() {
1217         struct ctree_root *root;
1218         struct key ins;
1219         struct key last = { (u64)-1, 0, 0};
1220         char *buf;
1221         int i;
1222         int num;
1223         int ret;
1224         int run_size = 10000;
1225         int max_key = 100000000;
1226         int tree_size = 0;
1227         struct ctree_path path;
1228         struct ctree_super_block super;
1229
1230         radix_tree_init();
1231
1232
1233         root = open_ctree("dbfile", &super);
1234         printf("root tree\n");
1235         print_tree(root, root->node);
1236         printf("map tree\n");
1237         print_tree(root->extent_root, root->extent_root->node);
1238         fflush(stdout);
1239
1240         srand(55);
1241         for (i = 0; i < run_size; i++) {
1242                 buf = malloc(64);
1243                 num = next_key(i, max_key);
1244                 // num = i;
1245                 sprintf(buf, "string-%d", num);
1246                 // printf("insert %d\n", num);
1247                 ins.objectid = num;
1248                 ins.offset = 0;
1249                 ins.flags = 0;
1250                 ret = insert_item(root, &ins, buf, strlen(buf));
1251                 if (!ret)
1252                         tree_size++;
1253         }
1254         write_ctree_super(root, &super);
1255         close_ctree(root);
1256
1257         root = open_ctree("dbfile", &super);
1258         printf("starting search\n");
1259         srand(55);
1260         for (i = 0; i < run_size; i++) {
1261                 num = next_key(i, max_key);
1262                 ins.objectid = num;
1263                 init_path(&path);
1264                 ret = search_slot(root, &ins, &path, 0);
1265                 if (ret) {
1266                         print_tree(root, root->node);
1267                         printf("unable to find %d\n", num);
1268                         exit(1);
1269                 }
1270                 release_path(root, &path);
1271         }
1272         write_ctree_super(root, &super);
1273         close_ctree(root);
1274         root = open_ctree("dbfile", &super);
1275         printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
1276                 node_level(root->node->node.header.flags),
1277                 root->node->node.header.nritems,
1278                 NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
1279         printf("all searches good, deleting some items\n");
1280         i = 0;
1281         srand(55);
1282         for (i = 0 ; i < run_size/4; i++) {
1283                 num = next_key(i, max_key);
1284                 ins.objectid = num;
1285                 init_path(&path);
1286                 ret = search_slot(root, &ins, &path, 0);
1287                 if (ret)
1288                         continue;
1289                 ret = del_item(root, &path);
1290                 if (ret != 0)
1291                         BUG();
1292                 release_path(root, &path);
1293                 tree_size--;
1294         }
1295         srand(128);
1296         for (i = 0; i < run_size; i++) {
1297                 buf = malloc(64);
1298                 num = next_key(i, max_key);
1299                 sprintf(buf, "string-%d", num);
1300                 ins.objectid = num;
1301                 ret = insert_item(root, &ins, buf, strlen(buf));
1302                 if (!ret)
1303                         tree_size++;
1304                 if (i >= 5) {
1305                         struct key ugh;
1306                         ugh.objectid = 5;
1307                         ugh.flags = 0;
1308                         ugh.offset = 0;
1309                         init_path(&path);
1310                         ret = search_slot(root, &ugh, &path, 0);
1311                         if (ret) {
1312                                 print_tree(root, root->node);
1313                                 printf("unable to find 5 %d\n", num);
1314                                 exit(1);
1315                         }
1316                         release_path(root, &path);
1317
1318                 }
1319         }
1320         write_ctree_super(root, &super);
1321         close_ctree(root);
1322         root = open_ctree("dbfile", &super);
1323         srand(128);
1324         printf("starting search2\n");
1325         for (i = 0; i < run_size; i++) {
1326                 num = next_key(i, max_key);
1327                 ins.objectid = num;
1328                 init_path(&path);
1329                 ret = search_slot(root, &ins, &path, 0);
1330                 if (ret) {
1331                         print_tree(root, root->node);
1332                         printf("unable to find %d\n", num);
1333                         exit(1);
1334                 }
1335                 release_path(root, &path);
1336         }
1337         printf("starting big long delete run\n");
1338         while(root->node && root->node->node.header.nritems > 0) {
1339                 struct leaf *leaf;
1340                 int slot;
1341                 ins.objectid = (u64)-1;
1342                 init_path(&path);
1343                 ret = search_slot(root, &ins, &path, 0);
1344                 if (ret == 0)
1345                         BUG();
1346
1347                 leaf = &path.nodes[0]->leaf;
1348                 slot = path.slots[0];
1349                 if (slot != leaf->header.nritems)
1350                         BUG();
1351                 while(path.slots[0] > 0) {
1352                         path.slots[0] -= 1;
1353                         slot = path.slots[0];
1354                         leaf = &path.nodes[0]->leaf;
1355
1356                         if (comp_keys(&last, &leaf->items[slot].key) <= 0)
1357                                 BUG();
1358                         memcpy(&last, &leaf->items[slot].key, sizeof(last));
1359                         ret = del_item(root, &path);
1360                         if (ret != 0) {
1361                                 printf("del_item returned %d\n", ret);
1362                                 BUG();
1363                         }
1364                         tree_size--;
1365                 }
1366                 release_path(root, &path);
1367         }
1368         write_ctree_super(root, &super);
1369         close_ctree(root);
1370         printf("tree size is now %d\n", tree_size);
1371         printf("map tree\n");
1372         print_tree(root->extent_root, root->extent_root->node);
1373         return 0;
1374 }