bpf: offload: add infrastructure for loading programs for a specific netdev
authorJakub Kicinski <jakub.kicinski@netronome.com>
Fri, 3 Nov 2017 20:56:17 +0000 (13:56 -0700)
committerDavid S. Miller <davem@davemloft.net>
Sun, 5 Nov 2017 13:26:18 +0000 (22:26 +0900)
The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifier
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/netdevice.h
include/uapi/linux/bpf.h
kernel/bpf/Makefile
kernel/bpf/core.c
kernel/bpf/offload.c [new file with mode: 0644]
kernel/bpf/syscall.c
kernel/bpf/verifier.c

index 520aeeb..e45d43f 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/wait.h>
 
 struct perf_event;
 struct bpf_prog;
@@ -182,6 +183,16 @@ struct bpf_verifier_ops {
                                  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_dev_offload {       /* per-program state for a BPF program loaded for a specific netdev */
+       struct bpf_prog         *prog;          /* program this state belongs to */
+       struct net_device       *netdev;        /* target device; reset to NULL when the offload is torn down */
+       void                    *dev_priv;      /* presumably driver-private data; not touched by the core code shown here */
+       struct list_head        offloads;       /* entry in the global list of offloads (list is protected by RTNL) */
+       bool                    dev_state;      /* set after a successful VERIFIER_PREP; gates the BPF_OFFLOAD_DESTROY call */
+       bool                    verifier_running;       /* set by verifier prep, cleared by translate/destroy */
+       wait_queue_head_t       verifier_done;  /* woken whenever verifier_running is cleared */
+};
+
 struct bpf_prog_aux {
        atomic_t refcnt;
        u32 used_map_cnt;
@@ -199,6 +210,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
        void *security;
 #endif
+       struct bpf_dev_offload *offload;
        union {
                struct work_struct work;
                struct rcu_head rcu;
@@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops;
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_prog_ops bpf_offload_prog_ops;
 extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
@@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+int bpf_prog_offload_compile(struct bpf_prog *prog);
+void bpf_prog_offload_destroy(struct bpf_prog *prog);
+
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+
+/*
+ * Tell whether the program owning @aux was loaded for a specific netdev,
+ * i.e. whether offload state has been attached to it.
+ */
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+       return aux->offload != NULL;
+}
+#else
+static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+                                       union bpf_attr *attr)
+{
+       return -EOPNOTSUPP;     /* stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are set */
+}
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+       return false;   /* stub: offload support is compiled out, report "not bound" unconditionally */
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
index 3b0976a..e45011d 100644 (file)
@@ -153,6 +153,7 @@ struct bpf_verifier_env {
        struct bpf_verifier_state *cur_state; /* current verifier state */
        struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
        const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
+       const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
        void *analyzer_priv; /* pointer to external analyzer's private data */
        struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
        u32 used_map_cnt;               /* number of used maps */
@@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
        return env->cur_state->regs;
 }
 
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+#else
+/*
+ * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are set.
+ *
+ * This lives in a header, so it must be static inline: a plain external
+ * definition would be emitted by every file that includes the header and
+ * the final link would fail with duplicate symbols.
+ */
+static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+       return -EOPNOTSUPP;
+}
+#endif
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
                 void *priv);
 
index 9af9fea..fda527c 100644 (file)
@@ -797,8 +797,13 @@ enum bpf_netdev_command {
         * is equivalent to XDP_ATTACHED_DRV.
         */
        XDP_QUERY_PROG,
+       /* BPF program for offload callbacks, invoked at program load time. */
+       BPF_OFFLOAD_VERIFIER_PREP,
+       BPF_OFFLOAD_TRANSLATE,
+       BPF_OFFLOAD_DESTROY,
 };
 
+struct bpf_ext_analyzer_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -815,6 +820,15 @@ struct netdev_bpf {
                        u8 prog_attached;
                        u32 prog_id;
                };
+               /* BPF_OFFLOAD_VERIFIER_PREP */
+               struct {
+                       struct bpf_prog *prog;
+                       const struct bpf_ext_analyzer_ops *ops; /* callee set */
+               } verifier;
+               /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+               struct {
+                       struct bpf_prog *prog;
+               } offload;
        };
 };
 
index a982067..80d191a 100644 (file)
@@ -260,6 +260,7 @@ union bpf_attr {
                __u32           kern_version;   /* checked when prog_type=kprobe */
                __u32           prog_flags;
                char            prog_name[BPF_OBJ_NAME_LEN];
+               __u32           prog_target_ifindex;    /* ifindex of netdev to prep for */
        };
 
        struct { /* anonymous struct used by BPF_OBJ_* commands */
index 16e95c8..e691da0 100644 (file)
@@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
index 7fe4487..8a6c377 100644 (file)
@@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
         * valid program, which in this case would simply not
         * be JITed, but falls back to the interpreter.
         */
-       fp = bpf_int_jit_compile(fp);
+       if (!bpf_prog_is_dev_bound(fp->aux)) {
+               fp = bpf_int_jit_compile(fp);
+       } else {
+               *err = bpf_prog_offload_compile(fp);
+               if (*err)
+                       return fp;
+       }
        bpf_prog_lock_ro(fp);
 
        /* The tail call compatibility check can only be done at
@@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
        struct bpf_prog_aux *aux;
 
        aux = container_of(work, struct bpf_prog_aux, work);
+       if (bpf_prog_is_dev_bound(aux))
+               bpf_prog_offload_destroy(aux->prog);
        bpf_jit_free(aux->prog);
 }
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644 (file)
index 0000000..5553e0e
--- /dev/null
@@ -0,0 +1,182 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+/*
+ * Bind @prog to the network device selected by attr->prog_target_ifindex.
+ *
+ * The ifindex is resolved in the calling task's network namespace while
+ * holding RTNL.  On success the freshly allocated offload state is attached
+ * to prog->aux and appended to bpf_prog_offload_devs.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL when
+ * prog_flags is non-zero or the device cannot be found, and -ENOMEM when
+ * the state cannot be allocated.
+ */
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+       struct net *net = current->nsproxy->net_ns;
+       struct bpf_dev_offload *state;
+       struct net_device *dev;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (attr->prog_flags)
+               return -EINVAL;
+
+       state = kzalloc(sizeof(*state), GFP_USER);
+       if (!state)
+               return -ENOMEM;
+
+       init_waitqueue_head(&state->verifier_done);
+       state->prog = prog;
+
+       /* Lookup and publication happen in one RTNL section. */
+       rtnl_lock();
+       dev = __dev_get_by_index(net, attr->prog_target_ifindex);
+       if (dev) {
+               state->netdev = dev;
+               prog->aux->offload = state;
+               list_add_tail(&state->offloads, &bpf_prog_offload_devs);
+       }
+       rtnl_unlock();
+
+       if (!dev) {
+               /* Never published, so it can simply be freed. */
+               kfree(state);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Send @cmd, with arguments in @data, to the ndo_bpf callback of the device
+ * @prog is bound to.  RTNL must be held by the caller.
+ *
+ * Returns -ENODEV when the offload state has no netdev, -EOPNOTSUPP when
+ * the driver does not implement ndo_bpf, otherwise the driver's result.
+ */
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+                            struct netdev_bpf *data)
+{
+       struct net_device *dev = prog->aux->offload->netdev;
+
+       ASSERT_RTNL();
+
+       if (!dev)
+               return -ENODEV;
+
+       if (dev->netdev_ops->ndo_bpf) {
+               data->command = cmd;
+               return dev->netdev_ops->ndo_bpf(dev, data);
+       }
+
+       return -EOPNOTSUPP;
+}
+
+/*
+ * Ask the driver of the bound device to get ready for verification of
+ * env->prog (BPF_OFFLOAD_VERIFIER_PREP).
+ *
+ * On success the analyzer ops handed back by the driver are stored in
+ * env->dev_ops and the offload state is flagged as holding device state
+ * with the verifier running.  Returns 0 or the error from the driver call.
+ */
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+       struct bpf_prog *prog = env->prog;
+       struct netdev_bpf req = {};
+       int ret;
+
+       req.verifier.prog = prog;
+
+       rtnl_lock();
+       ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_VERIFIER_PREP, &req);
+       if (!ret) {
+               env->dev_ops = req.verifier.ops;
+
+               prog->aux->offload->dev_state = true;
+               prog->aux->offload->verifier_running = true;
+       }
+       rtnl_unlock();
+
+       return ret;
+}
+/*
+ * Tear down the device side of @prog's offload state.  Both callers in this
+ * file run with RTNL held.  Waits for a running verifier to finish, asks the
+ * driver to destroy its state if it created any, then unlinks the state from
+ * the global list and clears the netdev pointer.
+ */
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+       struct bpf_dev_offload *offload = prog->aux->offload;
+       struct netdev_bpf data = {};
+
+       data.offload.prog = prog;
+       /* The verifier may still be calling into the driver; wait until it is flagged done. */
+       if (offload->verifier_running)
+               wait_event(offload->verifier_done, !offload->verifier_running);
+       /* dev_state is only set once VERIFIER_PREP succeeded, so only then is there driver state to destroy. */
+       if (offload->dev_state)
+               WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+       /* With netdev cleared, any later __bpf_offload_ndo() on this program returns -ENODEV. */
+       offload->dev_state = false;
+       list_del_init(&offload->offloads);
+       offload->netdev = NULL;
+}
+/*
+ * Release the offload state of a device-bound @prog; invoked from the
+ * deferred program free path.  Marks the verifier as done, tears down the
+ * device side under RTNL and frees the state.
+ */
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+       struct bpf_dev_offload *offload = prog->aux->offload;
+       /* Clear the flag and wake waiters before taking RTNL -- presumably so a teardown sleeping in wait_event() with RTNL held can proceed. */
+       offload->verifier_running = false;
+       wake_up(&offload->verifier_done);
+
+       rtnl_lock();
+       __bpf_prog_offload_destroy(prog);
+       rtnl_unlock();
+
+       kfree(offload);
+}
+/*
+ * Mark verification of @prog as finished and ask the driver of the bound
+ * device to translate it (BPF_OFFLOAD_TRANSLATE) under RTNL.  Returns the
+ * result of the driver call.
+ */
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+       struct bpf_dev_offload *offload = prog->aux->offload;
+       struct netdev_bpf data = {};
+       int ret;
+
+       data.offload.prog = prog;
+       /* Verification is over: clear the flag and wake anyone waiting on it before taking RTNL. */
+       offload->verifier_running = false;
+       wake_up(&offload->verifier_done);
+
+       rtnl_lock();
+       ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+       rtnl_unlock();
+
+       return ret;
+}
+/*
+ * Placeholder installed as bpf_func for device-bound programs.  If it is
+ * ever invoked it emits a warning and returns 0.
+ */
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+                                         const struct bpf_insn *insn)
+{
+       WARN(1, "attempt to execute device eBPF program on the host!");
+       return 0;
+}
+/*
+ * "Compile" step for device-bound programs, used by
+ * bpf_prog_select_runtime() in place of the host JIT.  Points bpf_func at
+ * the warning stub and hands the program to the driver for translation.
+ * Returns the result of the translation request.
+ */
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+       prog->bpf_func = bpf_prog_warn_on_exec;
+
+       return bpf_prog_offload_translate(prog);
+}
+/*
+ * Empty program ops table.  find_prog_type() installs it for device-bound
+ * programs instead of the ops registered for the program type.
+ */
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+/*
+ * Netdevice notifier callback.  When a device is being unregistered, tear
+ * down the offload state of every program bound to it.  Expects RTNL to be
+ * held and always returns NOTIFY_OK.
+ */
+static int bpf_offload_notification(struct notifier_block *notifier,
+                                   ulong event, void *ptr)
+{
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct bpf_dev_offload *cur, *next;
+
+       ASSERT_RTNL();
+
+       if (event != NETDEV_UNREGISTER)
+               return NOTIFY_OK;
+
+       /* _safe iteration: the destroy helper unlinks the current entry. */
+       list_for_each_entry_safe(cur, next, &bpf_prog_offload_devs, offloads) {
+               if (cur->netdev != dev)
+                       continue;
+               __bpf_prog_offload_destroy(cur->prog);
+       }
+
+       return NOTIFY_OK;
+}
+/* Notifier block that routes netdevice events to bpf_offload_notification(). */
+static struct notifier_block bpf_offload_notifier = {
+       .notifier_call = bpf_offload_notification,
+};
+/*
+ * Initcall: hook the offload code into netdevice notifications.
+ * NOTE(review): the return value of register_netdevice_notifier() is
+ * ignored and 0 is returned unconditionally -- confirm this is intended.
+ */
+static int __init bpf_offload_init(void)
+{
+       register_netdevice_notifier(&bpf_offload_notifier);
+       return 0;
+}
+
+subsys_initcall(bpf_offload_init);
index 323be24..1574b9f 100644 (file)
@@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
        if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
                return -EINVAL;
 
-       prog->aux->ops = bpf_prog_types[type];
+       if (!bpf_prog_is_dev_bound(prog->aux))
+               prog->aux->ops = bpf_prog_types[type];
+       else
+               prog->aux->ops = &bpf_offload_prog_ops;
        prog->type = type;
        return 0;
 }
@@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 {
        struct fd f = fdget(ufd);
        struct bpf_prog *prog;
@@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
        prog = ____bpf_prog_get(f);
        if (IS_ERR(prog))
                return prog;
-       if (type && prog->type != *type) {
+       if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
                prog = ERR_PTR(-EINVAL);
                goto out;
        }
@@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define        BPF_PROG_LOAD_LAST_FIELD prog_name
+#define        BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr)
        atomic_set(&prog->aux->refcnt, 1);
        prog->gpl_compatible = is_gpl ? 1 : 0;
 
+       if (attr->prog_target_ifindex) {
+               err = bpf_prog_offload_init(prog, attr);
+               if (err)
+                       goto free_prog;
+       }
+
        /* find program type: socket_filter vs tracing_filter */
        err = find_prog_type(type, prog);
        if (err < 0)
index 04357ad..51aabb3 100644 (file)
@@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
                                  int insn_idx, int prev_insn_idx)
 {
-       if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
-               return 0;
+       if (env->analyzer_ops && env->analyzer_ops->insn_hook)
+               return env->analyzer_ops->insn_hook(env, insn_idx,
+                                                   prev_insn_idx);
+       if (env->dev_ops && env->dev_ops->insn_hook)
+               return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
-       return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+       return 0;
 }
 
 static int do_check(struct bpf_verifier_env *env)
@@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                env->strict_alignment = true;
 
+       if (env->prog->aux->offload) {
+               ret = bpf_prog_offload_verifier_prep(env);
+               if (ret)
+                       goto err_unlock;
+       }
+
        ret = replace_map_fd_with_map_ptr(env);
        if (ret < 0)
                goto skip_full_check;