// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>

#include "bpf_lru_list.h"

#define LOCAL_FREE_TARGET		(128)
#define LOCAL_NR_SCANS			LOCAL_FREE_TARGET

#define PERCPU_FREE_TARGET		(4)
#define PERCPU_NR_SCANS			PERCPU_FREE_TARGET

/* Helpers to get the local list index */
#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)

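/* This file implements the two LRU flavours used by the BPF LRU hash maps:
 *
 * - common LRU: a single global bpf_lru_list shared by all CPUs, fronted
 *   by a small per-CPU local free/pending list.  LOCAL_FREE_TARGET is the
 *   batch of free nodes a CPU pulls from the global list at a time, and
 *   LOCAL_NR_SCANS bounds how many nodes one rotation/shrink pass visits.
 *
 * - per-CPU LRU: one independent bpf_lru_list per possible CPU, using the
 *   smaller PERCPU_FREE_TARGET/PERCPU_NR_SCANS since each CPU only manages
 *   its own share of the elements.
 */
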
static int get_next_cpu(int cpu)
{
	cpu = cpumask_next(cpu, cpu_possible_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(cpu_possible_mask);
	return cpu;
}

/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
	return &loc_l->lists[LOCAL_FREE_LIST_IDX];
}

static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
{
	return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
}

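/* A node's ref bit is set on lookup by bpf_lru_node_set_ref() (see
 * bpf_lru_list.h) and cleared below whenever the node is moved within
 * the active/inactive lists or flushed in from a local list, so a
 * referenced node gets one more round before it can be shrunk into a
 * free list.
 */
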
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
	return node->ref;
}

static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type < NR_BPF_LRU_LIST_COUNT)
		l->counts[type]++;
}

static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type < NR_BPF_LRU_LIST_COUNT)
		l->counts[type]--;
}

static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
					struct bpf_lru_node *node,
					struct list_head *free_list,
					enum bpf_lru_list_type tgt_free_type)
{
	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
		return;

	/* If the removing node is the next_inactive_rotation candidate,
	 * move the next_inactive_rotation pointer also.
	 */
	if (&node->list == l->next_inactive_rotation)
		l->next_inactive_rotation = l->next_inactive_rotation->prev;

	bpf_lru_list_count_dec(l, node->type);

	node->type = tgt_free_type;
	list_move(&node->list, free_list);
}

/* Move nodes from local list to the LRU list */
static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
				   struct bpf_lru_node *node,
				   enum bpf_lru_list_type tgt_type)
{
	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
		return;

	bpf_lru_list_count_inc(l, tgt_type);
	node->type = tgt_type;
	node->ref = 0;
	list_move(&node->list, &l->lists[tgt_type]);
}

/* Move nodes between or within active and inactive list (like
 * active to inactive, inactive to active or tail of active back to
 * the head of active).
 */
static void __bpf_lru_node_move(struct bpf_lru_list *l,
				struct bpf_lru_node *node,
				enum bpf_lru_list_type tgt_type)
{
	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
		return;

	if (node->type != tgt_type) {
		bpf_lru_list_count_dec(l, node->type);
		bpf_lru_list_count_inc(l, tgt_type);
		node->type = tgt_type;
	}
	node->ref = 0;

	/* If the moving node is the next_inactive_rotation candidate,
	 * move the next_inactive_rotation pointer also.
	 */
	if (&node->list == l->next_inactive_rotation)
		l->next_inactive_rotation = l->next_inactive_rotation->prev;

	list_move(&node->list, &l->lists[tgt_type]);
}

static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
{
	return l->counts[BPF_LRU_LIST_T_INACTIVE] <
		l->counts[BPF_LRU_LIST_T_ACTIVE];
}

/* Rotate the active list:
 * 1. Start from the tail of the active list
 * 2. If the node has the ref bit set, it will be rotated
 *    back to the head of active list with the ref bit cleared.
 *    Give this node one more chance to survive in the active list.
 * 3. If the ref bit is not set, move it to the head of the
 *    inactive list.
 * 4. It will at most scan nr_scans nodes
 */
static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
					 struct bpf_lru_list *l)
{
	struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
	struct bpf_lru_node *node, *tmp_node, *first_node;
	unsigned int i = 0;

	first_node = list_first_entry(active, struct bpf_lru_node, list);
	list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		else
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);

		if (++i == lru->nr_scans || node == first_node)
			break;
	}
}

/* Rotate the inactive list.  It starts from the next_inactive_rotation
 * candidate:
 * 1. If the node has ref bit set, it will be moved to the head
 *    of active list with the ref bit cleared.
 * 2. If the node does not have ref bit set, it will leave it
 *    at its current location (i.e. do nothing) so that it can
 *    be considered during the next inactive_shrink.
 * 3. It will at most scan nr_scans nodes
 */
static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
					   struct bpf_lru_list *l)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct list_head *cur, *last, *next = inactive;
	struct bpf_lru_node *node;
	unsigned int i = 0;

	if (list_empty(inactive))
		return;

	last = l->next_inactive_rotation->next;
	if (last == inactive)
		last = last->next;

	cur = l->next_inactive_rotation;
	while (i < lru->nr_scans) {
		if (cur == inactive) {
			cur = cur->prev;
			continue;
		}

		node = list_entry(cur, struct bpf_lru_node, list);
		next = cur->prev;
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		if (cur == last)
			break;
		cur = next;
		i++;
	}

	l->next_inactive_rotation = next;
}

/* Shrink the inactive list.  It starts from the tail of the
 * inactive list and only moves the nodes without the ref bit
 * set to the designated free list.
 */
static unsigned int
__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
			       struct bpf_lru_list *l,
			       unsigned int tgt_nshrink,
			       struct list_head *free_list,
			       enum bpf_lru_list_type tgt_free_type)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct bpf_lru_node *node, *tmp_node;
	unsigned int nshrinked = 0;
	unsigned int i = 0;

	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
		if (bpf_lru_node_is_ref(node)) {
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		} else if (lru->del_from_htab(lru->del_arg, node)) {
			__bpf_lru_node_move_to_free(l, node, free_list,
						    tgt_free_type);
			if (++nshrinked == tgt_nshrink)
				break;
		}

		if (++i == lru->nr_scans)
			break;
	}

	return nshrinked;
}

/* 1. Rotate the active list (if needed)
 * 2. Always rotate the inactive list
 */
static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
{
	if (bpf_lru_list_inactive_low(l))
		__bpf_lru_list_rotate_active(lru, l);

	__bpf_lru_list_rotate_inactive(lru, l);
}

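/* There is no background aging thread: the rotations above run on the
 * allocation path, right before a caller tries to take free nodes from
 * this bpf_lru_list.  Rotating the active list only when the inactive
 * list is low keeps a reasonable share of nodes available for shrinking.
 */
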
/* Calls __bpf_lru_list_shrink_inactive() to shrink some
 * ref-bit-cleared nodes and move them to the designated
 * free list.
 *
 * If it cannot get a free node after calling
 * __bpf_lru_list_shrink_inactive(), it will just remove
 * one node from either inactive or active list without
 * honoring the ref-bit.  It prefers inactive list to active
 * list in this situation.
 */
static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
					  struct bpf_lru_list *l,
					  unsigned int tgt_nshrink,
					  struct list_head *free_list,
					  enum bpf_lru_list_type tgt_free_type)
{
	struct bpf_lru_node *node, *tmp_node;
	struct list_head *force_shrink_list;
	unsigned int nshrinked;

	nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
						   free_list, tgt_free_type);
	if (nshrinked)
		return nshrinked;

	/* Do a force shrink by ignoring the reference bit */
	if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
		force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	else
		force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];

	list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
					 list) {
		if (lru->del_from_htab(lru->del_arg, node)) {
			__bpf_lru_node_move_to_free(l, node, free_list,
						    tgt_free_type);
			return 1;
		}
	}

	return 0;
}

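/* In the shrink paths above, a node is only moved to a free list after
 * del_from_htab() has succeeded, i.e. after the owning element has been
 * unlinked from the hash table.  If the callback cannot remove the
 * element, the node is simply skipped and the scan moves on.
 */
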
/* Flush the nodes from the local pending list to the LRU list */
static void __local_list_flush(struct bpf_lru_list *l,
			       struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node, *tmp_node;

	list_for_each_entry_safe_reverse(node, tmp_node,
					 local_pending_list(loc_l), list) {
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
		else
			__bpf_lru_node_move_in(l, node,
					       BPF_LRU_LIST_T_INACTIVE);
	}
}

static void bpf_lru_list_push_free(struct bpf_lru_list *l,
				   struct bpf_lru_node *node)
{
	unsigned long flags;

	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
		return;

	raw_spin_lock_irqsave(&l->lock, flags);
	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
	raw_spin_unlock_irqrestore(&l->lock, flags);
}

static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
					   struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	struct bpf_lru_node *node, *tmp_node;
	unsigned int nfree = 0;

	raw_spin_lock(&l->lock);

	__local_list_flush(l, loc_l);

	__bpf_lru_list_rotate(lru, l);

	list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
				 list) {
		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
					    BPF_LRU_LOCAL_LIST_T_FREE);
		if (++nfree == LOCAL_FREE_TARGET)
			break;
	}

	if (nfree < LOCAL_FREE_TARGET)
		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
				      local_free_list(loc_l),
				      BPF_LRU_LOCAL_LIST_T_FREE);

	raw_spin_unlock(&l->lock);
}

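/* bpf_lru_list_pop_free_to_local() refills in batches of up to
 * LOCAL_FREE_TARGET nodes so that the cost of taking the global
 * lru_list lock is amortized over many subsequent allocations from the
 * per-CPU local free list.
 */
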
static void __local_list_add_pending(struct bpf_lru *lru,
				     struct bpf_lru_locallist *loc_l,
				     int cpu,
				     struct bpf_lru_node *node,
				     u32 hash)
{
	*(u32 *)((void *)node + lru->hash_offset) = hash;
	node->cpu = cpu;
	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
	node->ref = 0;
	list_add(&node->list, local_pending_list(loc_l));
}

static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;

	node = list_first_entry_or_null(local_free_list(loc_l),
					struct bpf_lru_node,
					list);
	if (node)
		list_del(&node->list);

	return node;
}

static struct bpf_lru_node *
__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;
	bool force = false;

ignore_ref:
	/* Get from the tail (i.e. older element) of the pending list. */
	list_for_each_entry_reverse(node, local_pending_list(loc_l),
				    list) {
		if ((!bpf_lru_node_is_ref(node) || force) &&
		    lru->del_from_htab(lru->del_arg, node)) {
			list_del(&node->list);
			return node;
		}
	}

	if (!force) {
		force = true;
		goto ignore_ref;
	}

	return NULL;
}

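/* __local_list_pop_pending() makes up to two passes over the pending
 * list: the first pass honors the ref bit and only takes nodes that have
 * not been referenced; if that finds nothing, the forced pass takes any
 * node that can still be deleted from the hash table.
 */
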
static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	struct list_head *free_list;
	struct bpf_lru_node *node = NULL;
	struct bpf_lru_list *l;
	unsigned long flags;
	int cpu = raw_smp_processor_id();

	l = per_cpu_ptr(lru->percpu_lru, cpu);

	raw_spin_lock_irqsave(&l->lock, flags);

	__bpf_lru_list_rotate(lru, l);

	free_list = &l->lists[BPF_LRU_LIST_T_FREE];
	if (list_empty(free_list))
		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
				      BPF_LRU_LIST_T_FREE);

	if (!list_empty(free_list)) {
		node = list_first_entry(free_list, struct bpf_lru_node, list);
		*(u32 *)((void *)node + lru->hash_offset) = hash;
		node->ref = 0;
		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
	}

	raw_spin_unlock_irqrestore(&l->lock, flags);

	return node;
}

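/* In the per-CPU flavour each CPU owns a whole bpf_lru_list, so
 * bpf_percpu_lru_pop_free() never steals from other CPUs: if the free
 * list is empty after a rotation, up to PERCPU_FREE_TARGET nodes are
 * reclaimed from this CPU's own inactive/active lists.
 */
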
static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	struct bpf_lru_locallist *loc_l, *steal_loc_l;
	struct bpf_common_lru *clru = &lru->common_lru;
	struct bpf_lru_node *node;
	int steal, first_steal;
	unsigned long flags;
	int cpu = raw_smp_processor_id();

	loc_l = per_cpu_ptr(clru->local_list, cpu);

	raw_spin_lock_irqsave(&loc_l->lock, flags);

	node = __local_list_pop_free(loc_l);
	if (!node) {
		bpf_lru_list_pop_free_to_local(lru, loc_l);
		node = __local_list_pop_free(loc_l);
	}

	if (node)
		__local_list_add_pending(lru, loc_l, cpu, node, hash);

	raw_spin_unlock_irqrestore(&loc_l->lock, flags);

	if (node)
		return node;

	/* No free nodes found from the local free list and
	 * the global LRU list.
	 *
	 * Steal from the local free/pending list of the
	 * current CPU and remote CPU in RR.  It starts
	 * with the loc_l->next_steal CPU.
	 */

	first_steal = loc_l->next_steal;
	steal = first_steal;
	do {
		steal_loc_l = per_cpu_ptr(clru->local_list, steal);

		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);

		node = __local_list_pop_free(steal_loc_l);
		if (!node)
			node = __local_list_pop_pending(lru, steal_loc_l);

		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);

		steal = get_next_cpu(steal);
	} while (!node && steal != first_steal);

	loc_l->next_steal = steal;

	if (node) {
		raw_spin_lock_irqsave(&loc_l->lock, flags);
		__local_list_add_pending(lru, loc_l, cpu, node, hash);
		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
	}

	return node;
}

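/* Allocation order in bpf_common_lru_pop_free():
 * 1. this CPU's local free list,
 * 2. refill the local free list from the global LRU list,
 * 3. steal a free or pending node from the per-CPU local lists
 *    (including this CPU's own pending list), round-robin starting at
 *    loc_l->next_steal.
 */
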
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
{
	if (lru->percpu)
		return bpf_percpu_lru_pop_free(lru, hash);

	return bpf_common_lru_pop_free(lru, hash);
}

static void bpf_common_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	u8 node_type = READ_ONCE(node->type);
	unsigned long flags;

	if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
	    WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
		return;

	if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
		struct bpf_lru_locallist *loc_l;

		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);

		raw_spin_lock_irqsave(&loc_l->lock, flags);

		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
			goto check_lru_list;
		}

		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
		node->ref = 0;
		list_move(&node->list, local_free_list(loc_l));

		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
		return;
	}

check_lru_list:
	bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
}

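/* bpf_common_lru_push_free() re-checks node->type under the local list
 * lock because the type can change concurrently; if the node is still
 * pending, it is returned straight to the owning CPU's local free list,
 * avoiding the global lru_list lock, otherwise it falls back to
 * bpf_lru_list_push_free() on the global list.
 */
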
static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	struct bpf_lru_list *l;
	unsigned long flags;

	l = per_cpu_ptr(lru->percpu_lru, node->cpu);

	raw_spin_lock_irqsave(&l->lock, flags);

	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);

	raw_spin_unlock_irqrestore(&l->lock, flags);
}

void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
{
	if (lru->percpu)
		bpf_percpu_lru_push_free(lru, node);
	else
		bpf_common_lru_push_free(lru, node);
}

static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	u32 i;

	for (i = 0; i < nr_elems; i++) {
		struct bpf_lru_node *node;

		node = (struct bpf_lru_node *)(buf + node_offset);
		node->type = BPF_LRU_LIST_T_FREE;
		node->ref = 0;
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		buf += elem_size;
	}
}

static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	u32 i, pcpu_entries;
	int cpu;
	struct bpf_lru_list *l;

	pcpu_entries = nr_elems / num_possible_cpus();

	i = 0;

	for_each_possible_cpu(cpu) {
		struct bpf_lru_node *node;

		l = per_cpu_ptr(lru->percpu_lru, cpu);
again:
		node = (struct bpf_lru_node *)(buf + node_offset);
		node->cpu = cpu;
		node->type = BPF_LRU_LIST_T_FREE;
		node->ref = 0;
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		i++;
		buf += elem_size;
		if (i == nr_elems)
			break;
		if (i % pcpu_entries)
			goto again;
	}
}

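/* bpf_percpu_lru_populate() hands nr_elems / num_possible_cpus() nodes
 * to each possible CPU's free list and records the owning CPU in
 * node->cpu, so bpf_percpu_lru_push_free() always returns a node to the
 * list it came from.
 */
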
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
		      u32 elem_size, u32 nr_elems)
{
	if (lru->percpu)
		bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
					nr_elems);
	else
		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
					nr_elems);
}

static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
	int i;

	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
		INIT_LIST_HEAD(&loc_l->lists[i]);

	loc_l->next_steal = cpu;

	raw_spin_lock_init(&loc_l->lock);
}

static void bpf_lru_list_init(struct bpf_lru_list *l)
{
	int i;

	for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
		INIT_LIST_HEAD(&l->lists[i]);

	for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
		l->counts[i] = 0;

	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];

	raw_spin_lock_init(&l->lock);
}

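/* Public entry points, used by the LRU hash map (kernel/bpf/hashtab.c).
 * An illustrative (not prescriptive) lifecycle of a bpf_lru:
 *
 *	bpf_lru_init(&lru, percpu, hash_offset, del_from_htab, del_arg);
 *	bpf_lru_populate(&lru, buf, node_offset, elem_size, nr_elems);
 *	...
 *	node = bpf_lru_pop_free(&lru, hash);	(take a node for an insert)
 *	...
 *	bpf_lru_push_free(&lru, node);		(element deleted, recycle node)
 *	...
 *	bpf_lru_destroy(&lru);
 *
 * del_from_htab(del_arg, node) must unlink the element that owns the
 * node from the map so the node can be recycled.
 */
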
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
		 del_from_htab_func del_from_htab, void *del_arg)
{
	int cpu;

	if (percpu) {
		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
		if (!lru->percpu_lru)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct bpf_lru_list *l;

			l = per_cpu_ptr(lru->percpu_lru, cpu);
			bpf_lru_list_init(l);
		}
		lru->nr_scans = PERCPU_NR_SCANS;
	} else {
		struct bpf_common_lru *clru = &lru->common_lru;

		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
		if (!clru->local_list)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct bpf_lru_locallist *loc_l;

			loc_l = per_cpu_ptr(clru->local_list, cpu);
			bpf_lru_locallist_init(loc_l, cpu);
		}

		bpf_lru_list_init(&clru->lru_list);
		lru->nr_scans = LOCAL_NR_SCANS;
	}

	lru->percpu = percpu;
	lru->del_from_htab = del_from_htab;
	lru->del_arg = del_arg;
	lru->hash_offset = hash_offset;

	return 0;
}

void bpf_lru_destroy(struct bpf_lru *lru)
{
	if (lru->percpu)
		free_percpu(lru->percpu_lru);
	else
		free_percpu(lru->common_lru.local_list);
}