bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn().
authorAlexei Starovoitov <ast@kernel.org>
Wed, 1 Dec 2021 18:10:31 +0000 (10:10 -0800)
committerAndrii Nakryiko <andrii@kernel.org>
Thu, 2 Dec 2021 19:18:35 +0000 (11:18 -0800)
Given BPF program's BTF root type name perform the following steps:
. search in vmlinux candidate cache.
. if (present in cache and candidate list >= 1) return candidate list.
. do a linear search through kernel BTFs for possible candidates.
. regardless of number of candidates found populate vmlinux cache.
. if (candidate list >= 1) return candidate list.
. search in module candidate cache.
. if (present in cache) return candidate list (even if list is empty).
. do a linear search through BTFs of all kernel modules
  collecting candidates from all of them.
. regardless of number of candidates found populate module cache.
. return candidate list.
Then wire the result into bpf_core_apply_relo_insn().

When BPF program is trying to CO-RE relocate a type
that doesn't exist in either vmlinux BTF or in modules BTFs
these steps will perform 2 cache lookups when cache is hit.

Note the cache doesn't prevent the abuse by the program that might
have lots of relocations that cannot be resolved. Hence cond_resched().

CO-RE in the kernel requires CAP_BPF, since BTF loading requires it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-9-alexei.starovoitov@gmail.com
kernel/bpf/btf.c

index dbf1f38..ed4258c 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 #include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
 
 /* BTF (BPF Type Format) is the meta data format which describes
  * the data types of BPF program/map.  Hence, it basically focus
@@ -6169,6 +6170,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
        return len;
 }
 
+static void purge_cand_cache(struct btf *btf);
+
 static int btf_module_notify(struct notifier_block *nb, unsigned long op,
                             void *module)
 {
@@ -6203,6 +6206,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
                        goto out;
                }
 
+               purge_cand_cache(NULL);
                mutex_lock(&btf_module_mutex);
                btf_mod->module = module;
                btf_mod->btf = btf;
@@ -6245,6 +6249,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
                        list_del(&btf_mod->list);
                        if (btf_mod->sysfs_attr)
                                sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+                       purge_cand_cache(btf_mod->btf);
                        btf_put(btf_mod->btf);
                        kfree(btf_mod->sysfs_attr);
                        kfree(btf_mod);
@@ -6440,8 +6445,347 @@ size_t bpf_core_essential_name_len(const char *name)
        return n;
 }
 
+struct bpf_cand_cache {
+       const char *name;
+       u32 name_len;
+       u16 kind;
+       u16 cnt;
+       struct {
+               const struct btf *btf;
+               u32 id;
+       } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+       if (!cands->cnt)
+               /* empty candidate array was allocated on stack */
+               return;
+       kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+       kfree(cands->name);
+       kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+                              struct bpf_cand_cache **cache,
+                              int cache_size)
+{
+       struct bpf_cand_cache *cc;
+       int i, j;
+
+       for (i = 0; i < cache_size; i++) {
+               cc = cache[i];
+               if (!cc)
+                       continue;
+               bpf_log(log, "[%d]%s(", i, cc->name);
+               for (j = 0; j < cc->cnt; j++) {
+                       bpf_log(log, "%d", cc->cands[j].id);
+                       if (j < cc->cnt - 1)
+                               bpf_log(log, " ");
+               }
+               bpf_log(log, "), ");
+       }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+       mutex_lock(&cand_cache_mutex);
+       bpf_log(log, "vmlinux_cand_cache:");
+       __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+       bpf_log(log, "\nmodule_cand_cache:");
+       __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+       bpf_log(log, "\n");
+       mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+       return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+                                              struct bpf_cand_cache **cache,
+                                              int cache_size)
+{
+       struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+       if (cc && cc->name_len == cands->name_len &&
+           !strncmp(cc->name, cands->name, cands->name_len))
+               return cc;
+       return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+       return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+                                                 struct bpf_cand_cache **cache,
+                                                 int cache_size)
+{
+       struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+       if (*cc) {
+               bpf_free_cands_from_cache(*cc);
+               *cc = NULL;
+       }
+       new_cands = kmalloc(sizeof_cands(cands->cnt), GFP_KERNEL);
+       if (!new_cands) {
+               bpf_free_cands(cands);
+               return ERR_PTR(-ENOMEM);
+       }
+       memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+       /* strdup the name, since it will stay in cache.
+        * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+        */
+       new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+       bpf_free_cands(cands);
+       if (!new_cands->name) {
+               kfree(new_cands);
+               return ERR_PTR(-ENOMEM);
+       }
+       *cc = new_cands;
+       return new_cands;
+}
+
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+                              int cache_size)
+{
+       struct bpf_cand_cache *cc;
+       int i, j;
+
+       for (i = 0; i < cache_size; i++) {
+               cc = cache[i];
+               if (!cc)
+                       continue;
+               if (!btf) {
+                       /* when new module is loaded purge all of module_cand_cache,
+                        * since new module might have candidates with the name
+                        * that matches cached cands.
+                        */
+                       bpf_free_cands_from_cache(cc);
+                       cache[i] = NULL;
+                       continue;
+               }
+               /* when module is unloaded purge cache entries
+                * that match module's btf
+                */
+               for (j = 0; j < cc->cnt; j++)
+                       if (cc->cands[j].btf == btf) {
+                               bpf_free_cands_from_cache(cc);
+                               cache[i] = NULL;
+                               break;
+                       }
+       }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+       mutex_lock(&cand_cache_mutex);
+       __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+       mutex_unlock(&cand_cache_mutex);
+}
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+                  int targ_start_id)
+{
+       struct bpf_cand_cache *new_cands;
+       const struct btf_type *t;
+       const char *targ_name;
+       size_t targ_essent_len;
+       int n, i;
+
+       n = btf_nr_types(targ_btf);
+       for (i = targ_start_id; i < n; i++) {
+               t = btf_type_by_id(targ_btf, i);
+               if (btf_kind(t) != cands->kind)
+                       continue;
+
+               targ_name = btf_name_by_offset(targ_btf, t->name_off);
+               if (!targ_name)
+                       continue;
+
+               /* the resched point is before strncmp to make sure that search
+                * for non-existing name will have a chance to schedule().
+                */
+               cond_resched();
+
+               if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+                       continue;
+
+               targ_essent_len = bpf_core_essential_name_len(targ_name);
+               if (targ_essent_len != cands->name_len)
+                       continue;
+
+               /* most of the time there is only one candidate for a given kind+name pair */
+               new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+               if (!new_cands) {
+                       bpf_free_cands(cands);
+                       return ERR_PTR(-ENOMEM);
+               }
+
+               memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+               bpf_free_cands(cands);
+               cands = new_cands;
+               cands->cands[cands->cnt].btf = targ_btf;
+               cands->cands[cands->cnt].id = i;
+               cands->cnt++;
+       }
+       return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+       struct bpf_cand_cache *cands, *cc, local_cand = {};
+       const struct btf *local_btf = ctx->btf;
+       const struct btf_type *local_type;
+       const struct btf *main_btf;
+       size_t local_essent_len;
+       struct btf *mod_btf;
+       const char *name;
+       int id;
+
+       main_btf = bpf_get_btf_vmlinux();
+       if (IS_ERR(main_btf))
+               return (void *)main_btf;
+
+       local_type = btf_type_by_id(local_btf, local_type_id);
+       if (!local_type)
+               return ERR_PTR(-EINVAL);
+
+       name = btf_name_by_offset(local_btf, local_type->name_off);
+       if (str_is_empty(name))
+               return ERR_PTR(-EINVAL);
+       local_essent_len = bpf_core_essential_name_len(name);
+
+       cands = &local_cand;
+       cands->name = name;
+       cands->kind = btf_kind(local_type);
+       cands->name_len = local_essent_len;
+
+       cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+       /* cands is a pointer to stack here */
+       if (cc) {
+               if (cc->cnt)
+                       return cc;
+               goto check_modules;
+       }
+
+       /* Attempt to find target candidates in vmlinux BTF first */
+       cands = bpf_core_add_cands(cands, main_btf, 1);
+       if (IS_ERR(cands))
+               return cands;
+
+       /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+       /* populate cache even when cands->cnt == 0 */
+       cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+       if (IS_ERR(cc))
+               return cc;
+
+       /* if vmlinux BTF has any candidate, don't go for module BTFs */
+       if (cc->cnt)
+               return cc;
+
+check_modules:
+       /* cands is a pointer to stack here and cands->cnt == 0 */
+       cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+       if (cc)
+               /* if cache has it return it even if cc->cnt == 0 */
+               return cc;
+
+       /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+       spin_lock_bh(&btf_idr_lock);
+       idr_for_each_entry(&btf_idr, mod_btf, id) {
+               if (!btf_is_module(mod_btf))
+                       continue;
+               /* linear search could be slow hence unlock/lock
+                * the IDR to avoiding holding it for too long
+                */
+               btf_get(mod_btf);
+               spin_unlock_bh(&btf_idr_lock);
+               cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+               if (IS_ERR(cands)) {
+                       btf_put(mod_btf);
+                       return cands;
+               }
+               spin_lock_bh(&btf_idr_lock);
+               btf_put(mod_btf);
+       }
+       spin_unlock_bh(&btf_idr_lock);
+       /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+        * or pointer to stack if cands->cnd == 0.
+        * Copy it into the cache even when cands->cnt == 0 and
+        * return the result.
+        */
+       return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
 int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
                   int relo_idx, void *insn)
 {
-       return -EOPNOTSUPP;
+       bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+       struct bpf_core_cand_list cands = {};
+       int err;
+
+       if (need_cands) {
+               struct bpf_cand_cache *cc;
+               int i;
+
+               mutex_lock(&cand_cache_mutex);
+               cc = bpf_core_find_cands(ctx, relo->type_id);
+               if (IS_ERR(cc)) {
+                       bpf_log(ctx->log, "target candidate search failed for %d\n",
+                               relo->type_id);
+                       err = PTR_ERR(cc);
+                       goto out;
+               }
+               if (cc->cnt) {
+                       cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+                       if (!cands.cands) {
+                               err = -ENOMEM;
+                               goto out;
+                       }
+               }
+               for (i = 0; i < cc->cnt; i++) {
+                       bpf_log(ctx->log,
+                               "CO-RE relocating %s %s: found target candidate [%d]\n",
+                               btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+                       cands.cands[i].btf = cc->cands[i].btf;
+                       cands.cands[i].id = cc->cands[i].id;
+               }
+               cands.len = cc->cnt;
+               /* cand_cache_mutex needs to span the cache lookup and
+                * copy of btf pointer into bpf_core_cand_list,
+                * since module can be unloaded while bpf_core_apply_relo_insn
+                * is working with module's btf.
+                */
+       }
+
+       err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
+                                      relo, relo_idx, ctx->btf, &cands);
+out:
+       if (need_cands) {
+               kfree(cands.cands);
+               mutex_unlock(&cand_cache_mutex);
+               if (ctx->log->level & BPF_LOG_LEVEL2)
+                       print_cand_cache(ctx->log);
+       }
+       return err;
 }