bcache: Add struct btree_keys
drivers/md/bcache/bset.c
1 /*
2  * Code for working with individual keys, and sorted sets of keys within a
3  * btree node
4  *
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcache.h"
9 #include "btree.h"
10 #include "debug.h"
11
12 #include <linux/random.h>
13 #include <linux/prefetch.h>
14
15 /* Keylists */
16
17 int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
18 {
19         size_t oldsize = bch_keylist_nkeys(l);
20         size_t newsize = oldsize + u64s;
21         uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22         uint64_t *new_keys;
23
24         newsize = roundup_pow_of_two(newsize);
25
26         if (newsize <= KEYLIST_INLINE ||
27             roundup_pow_of_two(oldsize) == newsize)
28                 return 0;
29
30         new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
31
32         if (!new_keys)
33                 return -ENOMEM;
34
35         if (!old_keys)
36                 memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
37
38         l->keys_p = new_keys;
39         l->top_p = new_keys + oldsize;
40
41         return 0;
42 }
43
44 struct bkey *bch_keylist_pop(struct keylist *l)
45 {
46         struct bkey *k = l->keys;
47
48         if (k == l->top)
49                 return NULL;
50
51         while (bkey_next(k) != l->top)
52                 k = bkey_next(k);
53
54         return l->top = k;
55 }
56
57 void bch_keylist_pop_front(struct keylist *l)
58 {
59         l->top_p -= bkey_u64s(l->keys);
60
61         memmove(l->keys,
62                 bkey_next(l->keys),
63                 bch_keylist_bytes(l));
64 }
65
66 /* Key/pointer manipulation */
67
68 void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
69                               unsigned i)
70 {
71         BUG_ON(i > KEY_PTRS(src));
72
73         /* Only copy the header, key, and one pointer. */
74         memcpy(dest, src, 2 * sizeof(uint64_t));
75         dest->ptr[0] = src->ptr[i];
76         SET_KEY_PTRS(dest, 1);
77         /* We didn't copy the checksum so clear that bit. */
78         SET_KEY_CSUM(dest, 0);
79 }
80
81 bool __bch_cut_front(const struct bkey *where, struct bkey *k)
82 {
83         unsigned i, len = 0;
84
85         if (bkey_cmp(where, &START_KEY(k)) <= 0)
86                 return false;
87
88         if (bkey_cmp(where, k) < 0)
89                 len = KEY_OFFSET(k) - KEY_OFFSET(where);
90         else
91                 bkey_copy_key(k, where);
92
93         for (i = 0; i < KEY_PTRS(k); i++)
94                 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
95
96         BUG_ON(len > KEY_SIZE(k));
97         SET_KEY_SIZE(k, len);
98         return true;
99 }
100
101 bool __bch_cut_back(const struct bkey *where, struct bkey *k)
102 {
103         unsigned len = 0;
104
105         if (bkey_cmp(where, k) >= 0)
106                 return false;
107
108         BUG_ON(KEY_INODE(where) != KEY_INODE(k));
109
110         if (bkey_cmp(where, &START_KEY(k)) > 0)
111                 len = KEY_OFFSET(where) - KEY_START(k);
112
113         bkey_copy_key(k, where);
114
115         BUG_ON(len > KEY_SIZE(k));
116         SET_KEY_SIZE(k, len);
117         return true;
118 }
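/*
 * Worked example (illustrative only): take an extent key k with
 * KEY_OFFSET(k) == 100 and KEY_SIZE(k) == 20, i.e. covering 80..100.
 * With `where` at offset 90:
 *
 *   __bch_cut_front(where, k): len = 100 - 90 = 10, each PTR_OFFSET is bumped
 *   by 20 - 10 = 10 and KEY_SIZE becomes 10 - the key now covers 90..100.
 *
 *   __bch_cut_back(where, k): len = 90 - 80 = 10, KEY_OFFSET becomes 90 and
 *   KEY_SIZE becomes 10 - the key now covers 80..90.
 */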
119
120 /* Auxiliary search trees */
121
122 /* 32 bits total: */
123 #define BKEY_MID_BITS           3
124 #define BKEY_EXPONENT_BITS      7
125 #define BKEY_MANTISSA_BITS      (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
126 #define BKEY_MANTISSA_MASK      ((1 << BKEY_MANTISSA_BITS) - 1)
127
128 struct bkey_float {
129         unsigned        exponent:BKEY_EXPONENT_BITS;
130         unsigned        m:BKEY_MID_BITS;
131         unsigned        mantissa:BKEY_MANTISSA_BITS;
132 } __packed;
133
134 /*
135  * BSET_CACHELINE was originally intended to match the hardware cacheline size -
136  * it used to be 64, but I realized the lookup code would touch slightly less
137  * memory if it was 128.
138  *
139  * It defines the number of bytes (in struct bset) per struct bkey_float in
140  * the auxiliary search tree - when we're done searching the bkey_float tree we
141  * have this many bytes left that we do a linear search over.
142  *
143  * Since (after level 5) every level of the bset_tree is on a new cacheline,
144  * we're touching one fewer cacheline in the bset tree in exchange for one more
145  * cacheline in the linear search - but the linear search might stop before it
146  * gets to the second cacheline.
147  */
148
149 #define BSET_CACHELINE          128
150
151 /* Space required for the btree node keys */
152 static inline size_t btree_keys_bytes(struct btree_keys *b)
153 {
154         return PAGE_SIZE << b->page_order;
155 }
156
157 static inline size_t btree_keys_cachelines(struct btree_keys *b)
158 {
159         return btree_keys_bytes(b) / BSET_CACHELINE;
160 }
161
162 /* Space required for the auxiliary search trees */
163 static inline size_t bset_tree_bytes(struct btree_keys *b)
164 {
165         return btree_keys_cachelines(b) * sizeof(struct bkey_float);
166 }
167
168 /* Space required for the prev pointers */
169 static inline size_t bset_prev_bytes(struct btree_keys *b)
170 {
171         return btree_keys_cachelines(b) * sizeof(uint8_t);
172 }
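/*
 * Rough worked example (illustrative, assuming PAGE_SIZE == 4096 and
 * page_order == 0): btree_keys_bytes() is 4096, btree_keys_cachelines() is
 * 4096 / 128 == 32, bset_tree_bytes() is 32 * sizeof(struct bkey_float) ==
 * 128 and bset_prev_bytes() is 32 - so for small nodes both auxiliary arrays
 * are kmalloc()ed below rather than taken from __get_free_pages().
 */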
173
174 /* Memory allocation */
175
176 void bch_btree_keys_free(struct btree_keys *b)
177 {
178         struct bset_tree *t = b->set;
179
180         if (bset_prev_bytes(b) < PAGE_SIZE)
181                 kfree(t->prev);
182         else
183                 free_pages((unsigned long) t->prev,
184                            get_order(bset_prev_bytes(b)));
185
186         if (bset_tree_bytes(b) < PAGE_SIZE)
187                 kfree(t->tree);
188         else
189                 free_pages((unsigned long) t->tree,
190                            get_order(bset_tree_bytes(b)));
191
192         free_pages((unsigned long) t->data, b->page_order);
193
194         t->prev = NULL;
195         t->tree = NULL;
196         t->data = NULL;
197 }
198 EXPORT_SYMBOL(bch_btree_keys_free);
199
200 int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
201 {
202         struct bset_tree *t = b->set;
203
204         BUG_ON(t->data);
205
206         b->page_order = page_order;
207
208         t->data = (void *) __get_free_pages(gfp, b->page_order);
209         if (!t->data)
210                 goto err;
211
212         t->tree = bset_tree_bytes(b) < PAGE_SIZE
213                 ? kmalloc(bset_tree_bytes(b), gfp)
214                 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
215         if (!t->tree)
216                 goto err;
217
218         t->prev = bset_prev_bytes(b) < PAGE_SIZE
219                 ? kmalloc(bset_prev_bytes(b), gfp)
220                 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
221         if (!t->prev)
222                 goto err;
223
224         return 0;
225 err:
226         bch_btree_keys_free(b);
227         return -ENOMEM;
228 }
229 EXPORT_SYMBOL(bch_btree_keys_alloc);
230
231 void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
232                          bool *expensive_debug_checks)
233 {
234         unsigned i;
235
236         b->ops = ops;
237         b->expensive_debug_checks = expensive_debug_checks;
238         b->nsets = 0;
239         b->last_set_unwritten = 0;
240
241         /* XXX: shouldn't be needed */
242         for (i = 0; i < MAX_BSETS; i++)
243                 b->set[i].size = 0;
244         /*
245          * Second loop starts at 1 because b->keys[0]->data is the memory we
246          * allocated
247          */
248         for (i = 1; i < MAX_BSETS; i++)
249                 b->set[i].data = NULL;
250 }
251 EXPORT_SYMBOL(bch_btree_keys_init);
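#if 0
/*
 * Allocation lifecycle sketch (illustrative only - example_ops and
 * example_debug are placeholders, not symbols defined in bcache; b is
 * assumed to be zero-initialized by its caller):
 */
static int example_btree_keys_setup(struct btree_keys *b, unsigned page_order)
{
	static const struct btree_keys_ops example_ops;	/* hypothetical */
	static bool example_debug;			/* hypothetical */

	bch_btree_keys_init(b, &example_ops, &example_debug);

	if (bch_btree_keys_alloc(b, page_order, GFP_KERNEL))
		return -ENOMEM;

	/* ... insert keys, build search trees, sort ... */

	bch_btree_keys_free(b);
	return 0;
}
#endif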
252
253 /* Binary tree stuff for auxiliary search trees */
254
255 static unsigned inorder_next(unsigned j, unsigned size)
256 {
257         if (j * 2 + 1 < size) {
258                 j = j * 2 + 1;
259
260                 while (j * 2 < size)
261                         j *= 2;
262         } else
263                 j >>= ffz(j) + 1;
264
265         return j;
266 }
267
268 static unsigned inorder_prev(unsigned j, unsigned size)
269 {
270         if (j * 2 < size) {
271                 j = j * 2;
272
273                 while (j * 2 + 1 < size)
274                         j = j * 2 + 1;
275         } else
276                 j >>= ffs(j);
277
278         return j;
279 }
280
281 /* I have no idea why this code works... and I'm the one who wrote it
282  *
283  * However, I do know what it does:
284  * Given a binary tree constructed in an array (i.e. how you normally implement
285  * a heap), it converts a node in the tree - referenced by array index - to the
286  * index it would have if you did an inorder traversal.
287  *
288  * Also tested for every j, for sizes up to somewhere around 6 million.
289  *
290  * The binary tree starts at array index 1, not 0
291  * extra is a function of size:
292  *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
293  */
294 static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
295 {
296         unsigned b = fls(j);
297         unsigned shift = fls(size - 1) - b;
298
299         j  ^= 1U << (b - 1);
300         j <<= 1;
301         j  |= 1;
302         j <<= shift;
303
304         if (j > extra)
305                 j -= (j - extra) >> 1;
306
307         return j;
308 }
309
310 static unsigned to_inorder(unsigned j, struct bset_tree *t)
311 {
312         return __to_inorder(j, t->size, t->extra);
313 }
314
315 static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
316 {
317         unsigned shift;
318
319         if (j > extra)
320                 j += j - extra;
321
322         shift = ffs(j);
323
324         j >>= shift;
325         j  |= roundup_pow_of_two(size) >> shift;
326
327         return j;
328 }
329
330 static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
331 {
332         return __inorder_to_tree(j, t->size, t->extra);
333 }
334
335 #if 0
336 void inorder_test(void)
337 {
338         unsigned long done = 0;
339         ktime_t start = ktime_get();
340
341         for (unsigned size = 2;
342              size < 65536000;
343              size++) {
344                 unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
345                 unsigned i = 1, j = rounddown_pow_of_two(size - 1);
346
347                 if (!(size % 4096))
348                         printk(KERN_NOTICE "loop %u, %llu per us\n", size,
349                                done / ktime_us_delta(ktime_get(), start));
350
351                 while (1) {
352                         if (__inorder_to_tree(i, size, extra) != j)
353                                 panic("size %10u j %10u i %10u", size, j, i);
354
355                         if (__to_inorder(j, size, extra) != i)
356                                 panic("size %10u j %10u i %10u", size, j, i);
357
358                         if (j == rounddown_pow_of_two(size) - 1)
359                                 break;
360
361                         BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
362
363                         j = inorder_next(j, size);
364                         i++;
365                 }
366
367                 done += size - 1;
368         }
369 }
370 #endif
371
372 /*
373  * Cacheline/offset <-> bkey pointer arithmetic:
374  *
375  * t->tree is a binary search tree in an array; each node corresponds to a key
376  * in one cacheline in t->data (BSET_CACHELINE bytes).
377  *
378  * This means we don't have to store the full index of the key that a node in
379  * the binary tree points to; to_inorder() gives us the cacheline, and then
380  * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
381  *
382  * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
383  * make this work.
384  *
385  * To construct the bfloat for an arbitrary key we need to know what the key
386  * immediately preceding it is: we have to check if the two keys differ in the
387  * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
388  * of the previous key so we can walk backwards to it from t->tree[j]'s key.
389  */
390
391 static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
392                                       unsigned offset)
393 {
394         return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
395 }
396
397 static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
398 {
399         return ((void *) k - (void *) t->data) / BSET_CACHELINE;
400 }
401
402 static unsigned bkey_to_cacheline_offset(struct bkey *k)
403 {
404         return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
405 }
406
407 static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
408 {
409         return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
410 }
411
412 static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
413 {
414         return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
415 }
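/*
 * For example (illustrative only): if to_inorder(j, t) == 5 and
 * t->tree[j].m == 3, tree_to_bkey(t, j) points 5 * BSET_CACHELINE + 3 * 8 ==
 * 664 bytes past t->data, and tree_to_prev_bkey(t, j) is t->prev[j] u64s
 * before that.
 */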
416
417 /*
418  * For the write set - the one we're currently inserting keys into - we don't
419  * maintain a full search tree; we just keep a simple lookup table in t->prev.
420  */
421 static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
422 {
423         return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
424 }
425
426 static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
427 {
428         low >>= shift;
429         low  |= (high << 1) << (63U - shift);
430         return low;
431 }
432
433 static inline unsigned bfloat_mantissa(const struct bkey *k,
434                                        struct bkey_float *f)
435 {
436         const uint64_t *p = &k->low - (f->exponent >> 6);
437         return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
438 }
439
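/*
 * Build the bkey_float for tree node j: choose a BKEY_MANTISSA_BITS-wide
 * window of bits that covers the highest bit at which the keys bounding this
 * node's subtree (l and r) differ, and store that window of the node's key,
 * minus one for the sign-bit trick used in bset_search_tree(). If the window
 * can't distinguish the key from the previous key, mark the node as failed
 * (exponent == 127).
 */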
440 static void make_bfloat(struct bset_tree *t, unsigned j)
441 {
442         struct bkey_float *f = &t->tree[j];
443         struct bkey *m = tree_to_bkey(t, j);
444         struct bkey *p = tree_to_prev_bkey(t, j);
445
446         struct bkey *l = is_power_of_2(j)
447                 ? t->data->start
448                 : tree_to_prev_bkey(t, j >> ffs(j));
449
450         struct bkey *r = is_power_of_2(j + 1)
451                 ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
452                 : tree_to_bkey(t, j >> (ffz(j) + 1));
453
454         BUG_ON(m < l || m > r);
455         BUG_ON(bkey_next(p) != m);
456
457         if (KEY_INODE(l) != KEY_INODE(r))
458                 f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
459         else
460                 f->exponent = fls64(r->low ^ l->low);
461
462         f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
463
464         /*
465          * Setting f->exponent = 127 flags this node as failed, and causes the
466          * lookup code to fall back to comparing against the original key.
467          */
468
469         if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
470                 f->mantissa = bfloat_mantissa(m, f) - 1;
471         else
472                 f->exponent = 127;
473 }
474
475 static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
476 {
477         if (t != b->set) {
478                 unsigned j = roundup(t[-1].size,
479                                      64 / sizeof(struct bkey_float));
480
481                 t->tree = t[-1].tree + j;
482                 t->prev = t[-1].prev + j;
483         }
484
485         while (t < b->set + MAX_BSETS)
486                 t++->size = 0;
487 }
488
489 static void bch_bset_build_unwritten_tree(struct btree_keys *b)
490 {
491         struct bset_tree *t = bset_tree_last(b);
492
493         BUG_ON(b->last_set_unwritten);
494         b->last_set_unwritten = 1;
495
496         bset_alloc_tree(b, t);
497
498         if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
499                 t->prev[0] = bkey_to_cacheline_offset(t->data->start);
500                 t->size = 1;
501         }
502 }
503
504 void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
505 {
506         if (i != b->set->data) {
507                 b->set[++b->nsets].data = i;
508                 i->seq = b->set->data->seq;
509         } else
510                 get_random_bytes(&i->seq, sizeof(uint64_t));
511
512         i->magic        = magic;
513         i->version      = 0;
514         i->keys         = 0;
515
516         bch_bset_build_unwritten_tree(b);
517 }
518 EXPORT_SYMBOL(bch_bset_init_next);
519
520 void bch_bset_build_written_tree(struct btree_keys *b)
521 {
522         struct bset_tree *t = bset_tree_last(b);
523         struct bkey *k = t->data->start;
524         unsigned j, cacheline = 1;
525
526         b->last_set_unwritten = 0;
527
528         bset_alloc_tree(b, t);
529
530         t->size = min_t(unsigned,
531                         bkey_to_cacheline(t, bset_bkey_last(t->data)),
532                         b->set->tree + btree_keys_cachelines(b) - t->tree);
533
534         if (t->size < 2) {
535                 t->size = 0;
536                 return;
537         }
538
539         t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
540
541         /* First we figure out where the first key in each cacheline is */
542         for (j = inorder_next(0, t->size);
543              j;
544              j = inorder_next(j, t->size)) {
545                 while (bkey_to_cacheline(t, k) != cacheline)
546                         k = bkey_next(k);
547
548                 t->prev[j] = bkey_u64s(k);
549                 k = bkey_next(k);
550                 cacheline++;
551                 t->tree[j].m = bkey_to_cacheline_offset(k);
552         }
553
554         while (bkey_next(k) != bset_bkey_last(t->data))
555                 k = bkey_next(k);
556
557         t->end = *k;
558
559         /* Then we build the tree */
560         for (j = inorder_next(0, t->size);
561              j;
562              j = inorder_next(j, t->size))
563                 make_bfloat(t, j);
564 }
565 EXPORT_SYMBOL(bch_bset_build_written_tree);
566
567 void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
568 {
569         struct bset_tree *t;
570         unsigned inorder, j = 1;
571
572         for (t = b->set; t <= bset_tree_last(b); t++)
573                 if (k < bset_bkey_last(t->data))
574                         goto found_set;
575
576         BUG();
577 found_set:
578         if (!t->size || !bset_written(b, t))
579                 return;
580
581         inorder = bkey_to_cacheline(t, k);
582
583         if (k == t->data->start)
584                 goto fix_left;
585
586         if (bkey_next(k) == bset_bkey_last(t->data)) {
587                 t->end = *k;
588                 goto fix_right;
589         }
590
591         j = inorder_to_tree(inorder, t);
592
593         if (j &&
594             j < t->size &&
595             k == tree_to_bkey(t, j))
596 fix_left:       do {
597                         make_bfloat(t, j);
598                         j = j * 2;
599                 } while (j < t->size);
600
601         j = inorder_to_tree(inorder + 1, t);
602
603         if (j &&
604             j < t->size &&
605             k == tree_to_prev_bkey(t, j))
606 fix_right:      do {
607                         make_bfloat(t, j);
608                         j = j * 2 + 1;
609                 } while (j < t->size);
610 }
611 EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
612
613 static void bch_bset_fix_lookup_table(struct btree_keys *b,
614                                       struct bset_tree *t,
615                                       struct bkey *k)
616 {
617         unsigned shift = bkey_u64s(k);
618         unsigned j = bkey_to_cacheline(t, k);
619
620         /* We're getting called from btree_split() or btree_gc, just bail out */
621         if (!t->size)
622                 return;
623
624         /* k is the key we just inserted; we need to find the entry in the
625          * lookup table for the first key that is strictly greater than k:
626          * it's either k's cacheline or the next one
627          */
628         if (j < t->size &&
629             table_to_bkey(t, j) <= k)
630                 j++;
631
632         /* Adjust all the lookup table entries, and find a new key for any that
633          * have gotten too big
634          */
635         for (; j < t->size; j++) {
636                 t->prev[j] += shift;
637
638                 if (t->prev[j] > 7) {
639                         k = table_to_bkey(t, j - 1);
640
641                         while (k < cacheline_to_bkey(t, j, 0))
642                                 k = bkey_next(k);
643
644                         t->prev[j] = bkey_to_cacheline_offset(k);
645                 }
646         }
647
648         if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
649                 return;
650
651         /* Possibly add a new entry to the end of the lookup table */
652
653         for (k = table_to_bkey(t, t->size - 1);
654              k != bset_bkey_last(t->data);
655              k = bkey_next(k))
656                 if (t->size == bkey_to_cacheline(t, k)) {
657                         t->prev[t->size] = bkey_to_cacheline_offset(k);
658                         t->size++;
659                 }
660 }
661
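/*
 * Insert `insert` into the last (unwritten) set at position `where`: later
 * keys are shifted up by bkey_u64s(insert) and the write set's lookup table
 * is updated to match.
 */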
662 void bch_bset_insert(struct btree_keys *b, struct bkey *where,
663                      struct bkey *insert)
664 {
665         struct bset_tree *t = bset_tree_last(b);
666
667         BUG_ON(!b->last_set_unwritten);
668         BUG_ON(bset_byte_offset(b, t->data) +
669                __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
670                PAGE_SIZE << b->page_order);
671
672         memmove((uint64_t *) where + bkey_u64s(insert),
673                 where,
674                 (void *) bset_bkey_last(t->data) - (void *) where);
675
676         t->data->keys += bkey_u64s(insert);
677         bkey_copy(where, insert);
678         bch_bset_fix_lookup_table(b, t, where);
679 }
680 EXPORT_SYMBOL(bch_bset_insert);
681
682 struct bset_search_iter {
683         struct bkey *l, *r;
684 };
685
686 static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
687                                                      const struct bkey *search)
688 {
689         unsigned li = 0, ri = t->size;
690
691         while (li + 1 != ri) {
692                 unsigned m = (li + ri) >> 1;
693
694                 if (bkey_cmp(table_to_bkey(t, m), search) > 0)
695                         ri = m;
696                 else
697                         li = m;
698         }
699
700         return (struct bset_search_iter) {
701                 table_to_bkey(t, li),
702                 ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
703         };
704 }
705
706 static struct bset_search_iter bset_search_tree(struct bset_tree *t,
707                                                 const struct bkey *search)
708 {
709         struct bkey *l, *r;
710         struct bkey_float *f;
711         unsigned inorder, j, n = 1;
712
713         do {
714                 unsigned p = n << 4;
715                 p &= ((int) (p - t->size)) >> 31;
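                /*
                 * Branchless clamp: the arithmetic shift above is all ones
                 * when p < t->size and zero otherwise, so out-of-range
                 * prefetches collapse to &t->tree[0]. Prefetching 16 nodes
                 * ahead pulls in the subtree a few levels below n before the
                 * descent reaches it.
                 */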
716
717                 prefetch(&t->tree[p]);
718
719                 j = n;
720                 f = &t->tree[j];
721
722                 /*
723                  * n = (f->mantissa > bfloat_mantissa())
724                  *      ? j * 2
725                  *      : j * 2 + 1;
726                  *
727                  * We need to subtract 1 from f->mantissa for the sign bit trick
728                  * to work  - that's done in make_bfloat()
729                  */
730                 if (likely(f->exponent != 127))
731                         n = j * 2 + (((unsigned)
732                                       (f->mantissa -
733                                        bfloat_mantissa(search, f))) >> 31);
734                 else
735                         n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
736                                 ? j * 2
737                                 : j * 2 + 1;
738         } while (n < t->size);
739
740         inorder = to_inorder(j, t);
741
742         /*
743          * n would have been the node we recursed to - the low bit tells us if
744          * we recursed left or recursed right.
745          */
746         if (n & 1) {
747                 l = cacheline_to_bkey(t, inorder, f->m);
748
749                 if (++inorder != t->size) {
750                         f = &t->tree[inorder_next(j, t->size)];
751                         r = cacheline_to_bkey(t, inorder, f->m);
752                 } else
753                         r = bset_bkey_last(t->data);
754         } else {
755                 r = cacheline_to_bkey(t, inorder, f->m);
756
757                 if (--inorder) {
758                         f = &t->tree[inorder_prev(j, t->size)];
759                         l = cacheline_to_bkey(t, inorder, f->m);
760                 } else
761                         l = t->data->start;
762         }
763
764         return (struct bset_search_iter) {l, r};
765 }
766
767 struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
768                                const struct bkey *search)
769 {
770         struct bset_search_iter i;
771
772         /*
773          * First we search for a cacheline, and then we do a linear search
774          * within that cacheline.
775          *
776          * To search for the cacheline, there are three different possibilities:
777          *  * The set is too small to have a search tree, so we just do a linear
778          *    search over the whole set.
779          *  * The set is the one we're currently inserting into; keeping a full
780          *    auxiliary search tree up to date would be too expensive, so we
781          *    use a much simpler lookup table to do a binary search -
782          *    bset_search_write_set().
783          *  * Or we use the auxiliary search tree we constructed earlier -
784          *    bset_search_tree()
785          */
786
787         if (unlikely(!t->size)) {
788                 i.l = t->data->start;
789                 i.r = bset_bkey_last(t->data);
790         } else if (bset_written(&b->keys, t)) {
791                 /*
792                  * Each node in the auxiliary search tree covers a certain range
793                  * of bits, and keys above and below the set it covers might
794          * differ outside those bits - so we have to special-case the
795          * start and end; handle that here:
796                  */
797
798                 if (unlikely(bkey_cmp(search, &t->end) >= 0))
799                         return bset_bkey_last(t->data);
800
801                 if (unlikely(bkey_cmp(search, t->data->start) < 0))
802                         return t->data->start;
803
804                 i = bset_search_tree(t, search);
805         } else {
806                 BUG_ON(!b->keys.nsets &&
807                        t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
808
809                 i = bset_search_write_set(t, search);
810         }
811
812         if (expensive_debug_checks(b->c)) {
813                 BUG_ON(bset_written(&b->keys, t) &&
814                        i.l != t->data->start &&
815                        bkey_cmp(tree_to_prev_bkey(t,
816                           inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
817                                 search) > 0);
818
819                 BUG_ON(i.r != bset_bkey_last(t->data) &&
820                        bkey_cmp(i.r, search) <= 0);
821         }
822
823         while (likely(i.l != i.r) &&
824                bkey_cmp(i.l, search) <= 0)
825                 i.l = bkey_next(i.l);
826
827         return i.l;
828 }
829 EXPORT_SYMBOL(__bch_bset_search);
830
831 /* Btree iterator */
832
833 typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
834                                  struct btree_iter_set);
835
836 static inline bool btree_iter_cmp(struct btree_iter_set l,
837                                   struct btree_iter_set r)
838 {
839         return bkey_cmp(l.k, r.k) > 0;
840 }
841
842 static inline bool btree_iter_end(struct btree_iter *iter)
843 {
844         return !iter->used;
845 }
846
847 void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
848                          struct bkey *end)
849 {
850         if (k != end)
851                 BUG_ON(!heap_add(iter,
852                                  ((struct btree_iter_set) { k, end }),
853                                  btree_iter_cmp));
854 }
855
856 static struct bkey *__bch_btree_iter_init(struct btree *b,
857                                           struct btree_iter *iter,
858                                           struct bkey *search,
859                                           struct bset_tree *start)
860 {
861         struct bkey *ret = NULL;
862         iter->size = ARRAY_SIZE(iter->data);
863         iter->used = 0;
864
865 #ifdef CONFIG_BCACHE_DEBUG
866         iter->b = b;
867 #endif
868
869         for (; start <= bset_tree_last(&b->keys); start++) {
870                 ret = bch_bset_search(b, start, search);
871                 bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
872         }
873
874         return ret;
875 }
876
877 struct bkey *bch_btree_iter_init(struct btree *b,
878                                  struct btree_iter *iter,
879                                  struct bkey *search)
880 {
881         return __bch_btree_iter_init(b, iter, search, b->keys.set);
882 }
883 EXPORT_SYMBOL(bch_btree_iter_init);
884
885 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
886                                                  btree_iter_cmp_fn *cmp)
887 {
888         struct btree_iter_set unused;
889         struct bkey *ret = NULL;
890
891         if (!btree_iter_end(iter)) {
892                 bch_btree_iter_next_check(iter);
893
894                 ret = iter->data->k;
895                 iter->data->k = bkey_next(iter->data->k);
896
897                 if (iter->data->k > iter->data->end) {
898                         WARN_ONCE(1, "bset was corrupt!\n");
899                         iter->data->k = iter->data->end;
900                 }
901
902                 if (iter->data->k == iter->data->end)
903                         heap_pop(iter, unused, cmp);
904                 else
905                         heap_sift(iter, 0, cmp);
906         }
907
908         return ret;
909 }
910
911 struct bkey *bch_btree_iter_next(struct btree_iter *iter)
912 {
913         return __bch_btree_iter_next(iter, btree_iter_cmp);
915 }
916 EXPORT_SYMBOL(bch_btree_iter_next);
917
918 struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
919                                         struct btree_keys *b, ptr_filter_fn fn)
920 {
921         struct bkey *ret;
922
923         do {
924                 ret = bch_btree_iter_next(iter);
925         } while (ret && fn(b, ret));
926
927         return ret;
928 }
929
930 /* Mergesort */
931
932 void bch_bset_sort_state_free(struct bset_sort_state *state)
933 {
934         if (state->pool)
935                 mempool_destroy(state->pool);
936 }
937
938 int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
939 {
940         spin_lock_init(&state->time.lock);
941
942         state->page_order = page_order;
943         state->crit_factor = int_sqrt(1 << page_order);
944
945         state->pool = mempool_create_page_pool(1, page_order);
946         if (!state->pool)
947                 return -ENOMEM;
948
949         return 0;
950 }
951 EXPORT_SYMBOL(bch_bset_sort_state_init);
952
953 static void btree_mergesort(struct btree_keys *b, struct bset *out,
954                             struct btree_iter *iter,
955                             bool fixup, bool remove_stale)
956 {
957         int i;
958         struct bkey *k, *last = NULL;
959         BKEY_PADDED(k) tmp;
960         bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
961                 ? bch_ptr_bad
962                 : bch_ptr_invalid;
963
964         /* Heapify the iterator, using our comparison function */
965         for (i = iter->used / 2 - 1; i >= 0; --i)
966                 heap_sift(iter, i, b->ops->sort_cmp);
967
968         while (!btree_iter_end(iter)) {
969                 if (b->ops->sort_fixup && fixup)
970                         k = b->ops->sort_fixup(iter, &tmp.k);
971                 else
972                         k = NULL;
973
974                 if (!k)
975                         k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
976
977                 if (bad(b, k))
978                         continue;
979
980                 if (!last) {
981                         last = out->start;
982                         bkey_copy(last, k);
983                 } else if (!bch_bkey_try_merge(b, last, k)) {
984                         last = bkey_next(last);
985                         bkey_copy(last, k);
986                 }
987         }
988
989         out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
990
991         pr_debug("sorted %i keys", out->keys);
992 }
993
994 static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
995                          unsigned start, unsigned order, bool fixup,
996                          struct bset_sort_state *state)
997 {
998         uint64_t start_time;
999         bool used_mempool = false;
1000         struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1001                                                      order);
1002         if (!out) {
1003                 BUG_ON(order > state->page_order);
1004
1005                 out = page_address(mempool_alloc(state->pool, GFP_NOIO));
1006                 used_mempool = true;
1007                 order = state->page_order;
1008         }
1009
1010         start_time = local_clock();
1011
1012         btree_mergesort(b, out, iter, fixup, false);
1013         b->nsets = start;
1014
1015         if (!start && order == b->page_order) {
1016                 /*
1017                  * Our temporary buffer is the same size as the btree node's
1018                  * buffer, we can just swap buffers instead of doing a big
1019                  * memcpy()
1020                  */
1021
1022                 out->magic      = b->set->data->magic;
1023                 out->seq        = b->set->data->seq;
1024                 out->version    = b->set->data->version;
1025                 swap(out, b->set->data);
1026         } else {
1027                 b->set[start].data->keys = out->keys;
1028                 memcpy(b->set[start].data->start, out->start,
1029                        (void *) bset_bkey_last(out) - (void *) out->start);
1030         }
1031
1032         if (used_mempool)
1033                 mempool_free(virt_to_page(out), state->pool);
1034         else
1035                 free_pages((unsigned long) out, order);
1036
1037         bch_bset_build_written_tree(b);
1038
1039         if (!start)
1040                 bch_time_stats_update(&state->time, start_time);
1041 }
1042
1043 void bch_btree_sort_partial(struct btree *b, unsigned start,
1044                             struct bset_sort_state *state)
1045 {
1046         size_t order = b->keys.page_order, keys = 0;
1047         struct btree_iter iter;
1048         int oldsize = bch_count_data(b);
1049
1050         __bch_btree_iter_init(b, &iter, NULL, &b->keys.set[start]);
1051
1052         if (start) {
1053                 unsigned i;
1054
1055                 for (i = start; i <= b->keys.nsets; i++)
1056                         keys += b->keys.set[i].data->keys;
1057
1058                 order = roundup_pow_of_two(__set_bytes(b->keys.set->data,
1059                                                        keys)) / PAGE_SIZE;
1060                 if (order)
1061                         order = ilog2(order);
1062         }
1063
1064         __btree_sort(&b->keys, &iter, start, order, false, state);
1065
1066         EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
1067 }
1068 EXPORT_SYMBOL(bch_btree_sort_partial);
1069
1070 void bch_btree_sort_and_fix_extents(struct btree_keys *b,
1071                                     struct btree_iter *iter,
1072                                     struct bset_sort_state *state)
1073 {
1074         __btree_sort(b, iter, 0, b->page_order, true, state);
1075 }
1076
1077 void bch_btree_sort_into(struct btree *b, struct btree *new,
1078                          struct bset_sort_state *state)
1079 {
1080         uint64_t start_time = local_clock();
1081
1082         struct btree_iter iter;
1083         bch_btree_iter_init(b, &iter, NULL);
1084
1085         btree_mergesort(&b->keys, new->keys.set->data, &iter, false, true);
1086
1087         bch_time_stats_update(&state->time, start_time);
1088
1089         new->keys.set->size = 0; /* XXX: why? */
1090 }
1091
1092 #define SORT_CRIT       (4096 / sizeof(uint64_t))
1093
1094 void bch_btree_sort_lazy(struct btree *b, struct bset_sort_state *state)
1095 {
1096         unsigned crit = SORT_CRIT;
1097         int i;
1098
1099         b->keys.last_set_unwritten = 0;
1100
1101         /* Don't sort if nothing to do */
1102         if (!b->keys.nsets)
1103                 goto out;
1104
1105         for (i = b->keys.nsets - 1; i >= 0; --i) {
1106                 crit *= state->crit_factor;
1107
1108                 if (b->keys.set[i].data->keys < crit) {
1109                         bch_btree_sort_partial(b, i, state);
1110                         return;
1111                 }
1112         }
1113
1114         /* Sort if we'd overflow */
1115         if (b->keys.nsets + 1 == MAX_BSETS) {
1116                 bch_btree_sort(b, state);
1117                 return;
1118         }
1119
1120 out:
1121         bch_bset_build_written_tree(&b->keys);
1122 }
1123 EXPORT_SYMBOL(bch_btree_sort_lazy);
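/*
 * Worked example (illustrative): with page_order == 2, crit_factor ==
 * int_sqrt(1 << 2) == 2, so the first loop iteration above uses crit ==
 * SORT_CRIT * 2 == 1024 u64s. If the node has nsets == 2 and set[1] holds
 * 300 u64s of keys, 300 < 1024 and bch_btree_sort_partial(b, 1, state)
 * merges sets 1 and 2, leaving the large set 0 untouched.
 */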
1124
1125 /* Sysfs stuff */
1126
1127 struct bset_stats {
1128         struct btree_op op;
1129         size_t nodes;
1130         size_t sets_written, sets_unwritten;
1131         size_t bytes_written, bytes_unwritten;
1132         size_t floats, failed;
1133 };
1134
1135 static int btree_bset_stats(struct btree_op *op, struct btree *b)
1136 {
1137         struct bset_stats *stats = container_of(op, struct bset_stats, op);
1138         unsigned i;
1139
1140         stats->nodes++;
1141
1142         for (i = 0; i <= b->keys.nsets; i++) {
1143                 struct bset_tree *t = &b->keys.set[i];
1144                 size_t bytes = t->data->keys * sizeof(uint64_t);
1145                 size_t j;
1146
1147                 if (bset_written(&b->keys, t)) {
1148                         stats->sets_written++;
1149                         stats->bytes_written += bytes;
1150
1151                         stats->floats += t->size - 1;
1152
1153                         for (j = 1; j < t->size; j++)
1154                                 if (t->tree[j].exponent == 127)
1155                                         stats->failed++;
1156                 } else {
1157                         stats->sets_unwritten++;
1158                         stats->bytes_unwritten += bytes;
1159                 }
1160         }
1161
1162         return MAP_CONTINUE;
1163 }
1164
1165 int bch_bset_print_stats(struct cache_set *c, char *buf)
1166 {
1167         struct bset_stats t;
1168         int ret;
1169
1170         memset(&t, 0, sizeof(struct bset_stats));
1171         bch_btree_op_init(&t.op, -1);
1172
1173         ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1174         if (ret < 0)
1175                 return ret;
1176
1177         return snprintf(buf, PAGE_SIZE,
1178                         "btree nodes:           %zu\n"
1179                         "written sets:          %zu\n"
1180                         "unwritten sets:                %zu\n"
1181                         "written key bytes:     %zu\n"
1182                         "unwritten key bytes:   %zu\n"
1183                         "floats:                        %zu\n"
1184                         "failed:                        %zu\n",
1185                         t.nodes,
1186                         t.sets_written, t.sets_unwritten,
1187                         t.bytes_written, t.bytes_unwritten,
1188                         t.floats, t.failed);
1189 }