Merge branch 'Transit between BPF TCP congestion controls.'

author Martin KaFai Lau <martin.lau@kernel.org>

Thu, 23 Mar 2023 05:49:40 +0000 (22:49 -0700)

committer Martin KaFai Lau <martin.lau@kernel.org>

Thu, 23 Mar 2023 05:53:27 +0000 (22:53 -0700)
author Martin KaFai Lau <martin.lau@kernel.org>
Thu, 23 Mar 2023 05:49:40 +0000 (22:49 -0700)
committer Martin KaFai Lau <martin.lau@kernel.org>
Thu, 23 Mar 2023 05:53:27 +0000 (22:53 -0700)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index ec0df05..2d8f3f6 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1476,6 +1476,8 @@ struct bpf_link_ops {
         void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
         int (*fill_link_info)(const struct bpf_link *link,
                               struct bpf_link_info *info);
+       int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
+                         struct bpf_map *old_map);
  };
  
  struct bpf_tramp_link {
@@ -1518,6 +1520,8 @@ struct bpf_struct_ops {
                            void *kdata, const void *udata);
         int (*reg)(void *kdata);
         void (*unreg)(void *kdata);
+       int (*update)(void *kdata, void *old_kdata);
+       int (*validate)(void *kdata);
         const struct btf_type *type;
         const struct btf_type *value_type;
         const char *name;
@@ -1552,6 +1556,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
         else
                 module_put(owner);
  }
+int bpf_struct_ops_link_create(union bpf_attr *attr);
  
  #ifdef CONFIG_NET
  /* Define it here to avoid the use of forward declaration */
@@ -1592,6 +1597,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
  {
         return -EINVAL;
  }
+/* Stub variant: link creation is unconditionally refused with
+ * -EOPNOTSUPP.
+ * NOTE(review): presumably the fallback for builds without struct_ops
+ * support; the governing #if is outside this hunk - confirm.
+ */
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+       return -EOPNOTSUPP;
+}
+
  #endif
  
  #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
@@ -1945,6 +1955,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
  struct bpf_map *__bpf_map_get(struct fd f);
  void bpf_map_inc(struct bpf_map *map);
  void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
  struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
  void bpf_map_put_with_uref(struct bpf_map *map);
  void bpf_map_put(struct bpf_map *map);
diff --git a/include/net/tcp.h b/include/net/tcp.h

index db9f828..2abb755 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
  
  int tcp_register_congestion_control(struct tcp_congestion_ops *type);
  void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+                                 struct tcp_congestion_ops *old_type);
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
  
  void tcp_assign_congestion_control(struct sock *sk);
  void tcp_init_congestion_control(struct sock *sk);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 13129df..e3d3b51 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
         BPF_PERF_EVENT,
         BPF_TRACE_KPROBE_MULTI,
         BPF_LSM_CGROUP,
+       BPF_STRUCT_OPS,
         __MAX_BPF_ATTACH_TYPE
  };
  
@@ -1266,6 +1267,9 @@ enum {
  
  /* Create a map that is suitable to be an inner map with dynamic max entries */
         BPF_F_INNER_MAP         = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backing bpf_link */
+       BPF_F_LINK              = (1U << 13),
  };
  
  /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
         } task_fd_query;
  
         struct { /* struct used by BPF_LINK_CREATE command */
-               __u32           prog_fd;        /* eBPF program to attach */
+               union {
+                       __u32           prog_fd;        /* eBPF program to attach */
+                       __u32           map_fd;         /* struct_ops to attach */
+               };
                 union {
                         __u32           target_fd;      /* object to attach to */
                         __u32           target_ifindex; /* target ifindex */
@@ -1548,12 +1555,23 @@ union bpf_attr {
  
         struct { /* struct used by BPF_LINK_UPDATE command */
                 __u32           link_fd;        /* link fd */
-               /* new program fd to update link with */
-               __u32           new_prog_fd;
+               union {
+                       /* new program fd to update link with */
+                       __u32           new_prog_fd;
+                       /* new struct_ops map fd to update link with */
+                       __u32           new_map_fd;
+               };
                 __u32           flags;          /* extra flags */
-               /* expected link's program fd; is specified only if
-                * BPF_F_REPLACE flag is set in flags */
-               __u32           old_prog_fd;
+               union {
+                       /* expected link's program fd; is specified only if
+                        * BPF_F_REPLACE flag is set in flags.
+                        */
+                       __u32           old_prog_fd;
+                       /* expected link's map fd; is specified only
+                        * if BPF_F_REPLACE flag is set.
+                        */
+                       __u32           old_map_fd;
+               };
         } link_update;
  
         struct {
@@ -6379,6 +6397,9 @@ struct bpf_link_info {
                 struct {
                         __u32 ifindex;
                 } xdp;
+               struct {
+                       __u32 map_id;
+               } struct_ops;
         };
  } __attribute__((aligned(8)));
  
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c

index ba7a942..6401dec 100644 (file)
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,11 +11,13 @@
  #include <linux/refcount.h>
  #include <linux/mutex.h>
  #include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
  
  enum bpf_struct_ops_state {
         BPF_STRUCT_OPS_STATE_INIT,
         BPF_STRUCT_OPS_STATE_INUSE,
         BPF_STRUCT_OPS_STATE_TOBEFREE,
+       BPF_STRUCT_OPS_STATE_READY,
  };
  
  #define BPF_STRUCT_OPS_COMMON_VALUE                    \
@@ -58,6 +60,13 @@ struct bpf_struct_ops_map {
         struct bpf_struct_ops_value kvalue;
  };
  
+/* A bpf_link whose attachment is a struct_ops map rather than a prog.
+ *
+ * @link: embedded generic link; container_of() recovers this struct.
+ * @map:  the attached struct_ops map. Published with
+ *        RCU_INIT_POINTER()/rcu_assign_pointer() and read either under
+ *        rcu_read_lock() or with update_mutex held. It is still NULL
+ *        when registration failed in bpf_struct_ops_link_create().
+ */
+struct bpf_struct_ops_link {
+       struct bpf_link link;
+       struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
  #define VALUE_PREFIX "bpf_struct_ops_"
  #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
  
@@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
         struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
         struct bpf_struct_ops_value *uvalue, *kvalue;
         enum bpf_struct_ops_state state;
+       s64 refcnt;
  
         if (unlikely(*(u32 *)key != 0))
                 return -ENOENT;
@@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
         uvalue = value;
         memcpy(uvalue, st_map->uvalue, map->value_size);
         uvalue->state = state;
-       refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+       /* This value offers the user space a general estimate of how
+        * many sockets are still utilizing this struct_ops for TCP
+        * congestion control. The number might not be exact, but it
+        * should sufficiently meet our present goals.
+        */
+       refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+       refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
  
         return 0;
  }
@@ -491,12 +508,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
                 *(unsigned long *)(udata + moff) = prog->aux->id;
         }
  
-       refcount_set(&kvalue->refcnt, 1);
-       bpf_map_inc(map);
+       if (st_map->map.map_flags & BPF_F_LINK) {
+               err = st_ops->validate(kdata);
+               if (err)
+                       goto reset_unlock;
+               set_memory_rox((long)st_map->image, 1);
+               /* Let bpf_link handle registration & unregistration.
+                *
+                * Pair with smp_load_acquire() during lookup_elem().
+                */
+               smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+               goto unlock;
+       }
  
         set_memory_rox((long)st_map->image, 1);
         err = st_ops->reg(kdata);
         if (likely(!err)) {
+               /* This refcnt increment on the map here after
+                * 'st_ops->reg()' is secure since the state of the
+                * map must be set to INIT at this moment, and thus
+                * bpf_struct_ops_map_delete_elem() can't unregister
+                * or transition it to TOBEFREE concurrently.
+                */
+               bpf_map_inc(map);
                 /* Pair with smp_load_acquire() during lookup_elem().
                  * It ensures the above udata updates (e.g. prog->aux->id)
                  * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -512,7 +546,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
          */
         set_memory_nx((long)st_map->image, 1);
         set_memory_rw((long)st_map->image, 1);
-       bpf_map_put(map);
  
  reset_unlock:
         bpf_struct_ops_map_put_progs(st_map);
@@ -530,14 +563,16 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
         struct bpf_struct_ops_map *st_map;
  
         st_map = (struct bpf_struct_ops_map *)map;
+       if (st_map->map.map_flags & BPF_F_LINK)
+               return -EOPNOTSUPP;
+
         prev_state = cmpxchg(&st_map->kvalue.state,
                              BPF_STRUCT_OPS_STATE_INUSE,
                              BPF_STRUCT_OPS_STATE_TOBEFREE);
         switch (prev_state) {
         case BPF_STRUCT_OPS_STATE_INUSE:
                 st_map->st_ops->unreg(&st_map->kvalue.data);
-               if (refcount_dec_and_test(&st_map->kvalue.refcnt))
-                       bpf_map_put(map);
+               bpf_map_put(map);
                 return 0;
         case BPF_STRUCT_OPS_STATE_TOBEFREE:
                 return -EINPROGRESS;
@@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
         kfree(value);
  }
  
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
  {
         struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
  
@@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
         bpf_map_area_free(st_map);
  }
  
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+       /* A struct_ops function may switch to another struct_ops.
+        *
+        * For example, bpf_tcp_cc_x->init() may switch to
+        * another tcp_cc_y by calling
+        * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+        * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+        * and its refcount may reach 0, which would then free its
+        * trampoline image while tcp_cc_x is still running.
+        *
+        * A vanilla rcu gp waits for all bpf-tcp-cc progs to
+        * finish; a bpf-tcp-cc prog is non-sleepable.
+        * A rcu_tasks gp waits for the last few insns
+        * in the trampoline image to finish before the
+        * trampoline image is released.
+        */
+       synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+       __bpf_struct_ops_map_free(map);
+}
+
  static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
  {
         if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-           attr->map_flags || !attr->btf_vmlinux_value_type_id)
+           (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
                 return -EINVAL;
         return 0;
  }
@@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
         if (attr->value_size != vt->size)
                 return ERR_PTR(-EINVAL);
  
+       if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+               return ERR_PTR(-EOPNOTSUPP);
+
         t = st_ops->type;
  
         st_map_size = sizeof(*st_map) +
@@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
                                    NUMA_NO_NODE);
         st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
         if (!st_map->uvalue || !st_map->links || !st_map->image) {
-               bpf_struct_ops_map_free(map);
+               __bpf_struct_ops_map_free(map);
                 return ERR_PTR(-ENOMEM);
         }
  
@@ -676,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
  bool bpf_struct_ops_get(const void *kdata)
  {
         struct bpf_struct_ops_value *kvalue;
+       struct bpf_struct_ops_map *st_map;
+       struct bpf_map *map;
  
         kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+       st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
  
-       return refcount_inc_not_zero(&kvalue->refcnt);
+       map = __bpf_map_inc_not_zero(&st_map->map, false);
+       return !IS_ERR(map);
  }
  
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
  {
+       struct bpf_struct_ops_value *kvalue;
         struct bpf_struct_ops_map *st_map;
  
-       st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+       kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+       st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
         bpf_map_put(&st_map->map);
  }
  
-void bpf_struct_ops_put(const void *kdata)
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
  {
-       struct bpf_struct_ops_value *kvalue;
+       struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
  
-       kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
-       if (refcount_dec_and_test(&kvalue->refcnt)) {
-               struct bpf_struct_ops_map *st_map;
+       return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+               map->map_flags & BPF_F_LINK &&
+               /* Pair with smp_store_release() during map_update */
+               smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
  
-               st_map = container_of(kvalue, struct bpf_struct_ops_map,
-                                     kvalue);
-               /* The struct_ops's function may switch to another struct_ops.
-                *
-                * For example, bpf_tcp_cc_x->init() may switch to
-                * another tcp_cc_y by calling
-                * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-                * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-                * and its map->refcnt may reach 0 which then free its
-                * trampoline image while tcp_cc_x is still running.
-                *
-                * Thus, a rcu grace period is needed here.
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+       struct bpf_struct_ops_link *st_link;
+       struct bpf_struct_ops_map *st_map;
+
+       st_link = container_of(link, struct bpf_struct_ops_link, link);
+       st_map = (struct bpf_struct_ops_map *)
+               rcu_dereference_protected(st_link->map, true);
+       if (st_map) {
+               /* st_link->map can be NULL if
+                * bpf_struct_ops_link_create() fails to register.
                  */
-               call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+               st_map->st_ops->unreg(&st_map->kvalue.data);
+               bpf_map_put(&st_map->map);
         }
+       kfree(st_link);
  }
+
+/* Emit "map_id:" for the map currently attached to the link.
+ *
+ * The map pointer is sampled under rcu_read_lock() because
+ * bpf_struct_ops_map_link_update() may replace it with
+ * rcu_assign_pointer().
+ * NOTE(review): map is dereferenced without a NULL check; this relies
+ * on the link only being reachable once its map has been set - confirm.
+ */
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+                                           struct seq_file *seq)
+{
+       struct bpf_struct_ops_link *st_link;
+       struct bpf_map *map;
+
+       st_link = container_of(link, struct bpf_struct_ops_link, link);
+       rcu_read_lock();
+       map = rcu_dereference(st_link->map);
+       seq_printf(seq, "map_id:\t%d\n", map->id);
+       rcu_read_unlock();
+}
+
+/* Fill info->struct_ops.map_id with the id of the map currently
+ * attached to the link. Always returns 0.
+ *
+ * The map pointer is sampled under rcu_read_lock() because
+ * bpf_struct_ops_map_link_update() may replace it with
+ * rcu_assign_pointer().
+ * NOTE(review): map is dereferenced without a NULL check; this relies
+ * on the link only being reachable once its map has been set - confirm.
+ */
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+                                              struct bpf_link_info *info)
+{
+       struct bpf_struct_ops_link *st_link;
+       struct bpf_map *map;
+
+       st_link = container_of(link, struct bpf_struct_ops_link, link);
+       rcu_read_lock();
+       map = rcu_dereference(st_link->map);
+       info->struct_ops.map_id = map->id;
+       rcu_read_unlock();
+       return 0;
+}
+
+/* Swap the struct_ops map attached to @link for @new_map.
+ *
+ * @link:             a struct_ops link (embedded in bpf_struct_ops_link).
+ * @new_map:          map to attach; must pass bpf_struct_ops_valid_to_reg().
+ * @expected_old_map: if non-NULL, the update is refused unless the link
+ *                    currently holds exactly this map.
+ *
+ * Returns 0 on success, -EINVAL if @new_map is not registrable or its
+ * st_ops differs from the current map's, -EPERM if @expected_old_map
+ * does not match, or the error returned by st_ops->update().
+ */
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+                                         struct bpf_map *expected_old_map)
+{
+       struct bpf_struct_ops_map *st_map, *old_st_map;
+       struct bpf_map *old_map;
+       struct bpf_struct_ops_link *st_link;
+       int err = 0;
+
+       st_link = container_of(link, struct bpf_struct_ops_link, link);
+       st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+       if (!bpf_struct_ops_valid_to_reg(new_map))
+               return -EINVAL;
+
+       /* update_mutex is held across the read of st_link->map, the
+        * ->update() call and the pointer swap below.
+        */
+       mutex_lock(&update_mutex);
+
+       old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+       if (expected_old_map && old_map != expected_old_map) {
+               err = -EPERM;
+               goto err_out;
+       }
+
+       old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+       /* The new and old struct_ops must be the same type. */
+       if (st_map->st_ops != old_st_map->st_ops) {
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+       if (err)
+               goto err_out;
+
+       /* The link takes its own reference on the new map before
+        * publishing it, then drops the reference it held on the old one.
+        */
+       bpf_map_inc(new_map);
+       rcu_assign_pointer(st_link->map, new_map);
+       bpf_map_put(old_map);
+
+err_out:
+       mutex_unlock(&update_mutex);
+
+       return err;
+}
+
+/* Link callbacks installed by bpf_struct_ops_link_create() for
+ * BPF_LINK_TYPE_STRUCT_OPS links.
+ */
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+       .dealloc = bpf_struct_ops_map_link_dealloc,
+       .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+       .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+       .update_map = bpf_struct_ops_map_link_update,
+};
+
+/* Create a bpf_link for the struct_ops map named by
+ * attr->link_create.map_fd.
+ *
+ * The map must pass bpf_struct_ops_valid_to_reg(). On success the
+ * struct_ops is registered through st_ops->reg(), the map reference
+ * taken here is kept by the link, and the result of
+ * bpf_link_settle() is returned. On failure the map reference is
+ * dropped and a negative errno is returned.
+ */
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+       struct bpf_struct_ops_link *link = NULL;
+       struct bpf_link_primer link_primer;
+       struct bpf_struct_ops_map *st_map;
+       struct bpf_map *map;
+       int err;
+
+       /* bpf_map_get() signals failure with an ERR_PTR() (see
+        * link_update_map()), so a NULL test would let an error
+        * pointer through to the dereferences below.
+        */
+       map = bpf_map_get(attr->link_create.map_fd);
+       if (IS_ERR(map))
+               return PTR_ERR(map);
+
+       st_map = (struct bpf_struct_ops_map *)map;
+
+       if (!bpf_struct_ops_valid_to_reg(map)) {
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       link = kzalloc(sizeof(*link), GFP_USER);
+       if (!link) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+       bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err)
+               goto err_out;
+
+       err = st_map->st_ops->reg(st_map->kvalue.data);
+       if (err) {
+               /* NOTE(review): bpf_link_cleanup() presumably releases
+                * the primed link itself (->dealloc tolerates a NULL
+                * map for this case), so clear the local pointer to
+                * make the kfree() at err_out a no-op.
+                */
+               bpf_link_cleanup(&link_primer);
+               link = NULL;
+               goto err_out;
+       }
+       /* From here on the link owns the map reference taken above. */
+       RCU_INIT_POINTER(link->map, map);
+
+       return bpf_link_settle(&link_primer);
+
+err_out:
+       bpf_map_put(map);
+       kfree(link);
+       return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 099e906..b4d758f 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
         return map;
  }
  
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
  {
         int refold;
  
@@ -2823,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
         const struct bpf_prog *prog = link->prog;
         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
  
-       bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
         seq_printf(m,
                    "link_type:\t%s\n"
-                  "link_id:\t%u\n"
-                  "prog_tag:\t%s\n"
-                  "prog_id:\t%u\n",
+                  "link_id:\t%u\n",
                    bpf_link_type_strs[link->type],
-                  link->id,
-                  prog_tag,
-                  prog->aux->id);
+                  link->id);
+       if (prog) {
+               bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+               seq_printf(m,
+                          "prog_tag:\t%s\n"
+                          "prog_id:\t%u\n",
+                          prog_tag,
+                          prog->aux->id);
+       }
         if (link->ops->show_fdinfo)
                 link->ops->show_fdinfo(link, m);
  }
@@ -4312,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
  
         info.type = link->type;
         info.id = link->id;
-       info.prog_id = link->prog->aux->id;
+       if (link->prog)
+               info.prog_id = link->prog->aux->id;
  
         if (link->ops->fill_link_info) {
                 err = link->ops->fill_link_info(link, &info);
@@ -4575,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
         if (CHECK_ATTR(BPF_LINK_CREATE))
                 return -EINVAL;
  
+       if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+               return bpf_struct_ops_link_create(attr);
+
         prog = bpf_prog_get(attr->link_create.prog_fd);
         if (IS_ERR(prog))
                 return PTR_ERR(prog);
@@ -4673,6 +4682,35 @@ out:
         return ret;
  }
  
+/* Update a map-backed link, i.e. one whose ops provide ->update_map.
+ *
+ * Resolves attr->link_update.new_map_fd and, when BPF_F_REPLACE is
+ * set, attr->link_update.old_map_fd, then delegates to
+ * link->ops->update_map(). The map references taken here are dropped
+ * before returning.
+ *
+ * Returns the result of ->update_map(), the PTR_ERR() of a failed fd
+ * lookup (matching how link_update() reports a bad new_prog_fd), or
+ * -EINVAL if old_map_fd is given without BPF_F_REPLACE.
+ */
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+       struct bpf_map *new_map, *old_map = NULL;
+       int ret;
+
+       new_map = bpf_map_get(attr->link_update.new_map_fd);
+       if (IS_ERR(new_map))
+               return PTR_ERR(new_map);
+
+       if (attr->link_update.flags & BPF_F_REPLACE) {
+               old_map = bpf_map_get(attr->link_update.old_map_fd);
+               if (IS_ERR(old_map)) {
+                       ret = PTR_ERR(old_map);
+                       goto out_put;
+               }
+       } else if (attr->link_update.old_map_fd) {
+               /* old_map_fd is only meaningful with BPF_F_REPLACE. */
+               ret = -EINVAL;
+               goto out_put;
+       }
+
+       ret = link->ops->update_map(link, new_map, old_map);
+
+       if (old_map)
+               bpf_map_put(old_map);
+out_put:
+       bpf_map_put(new_map);
+       return ret;
+}
+
  #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
  
  static int link_update(union bpf_attr *attr)
@@ -4693,6 +4731,11 @@ static int link_update(union bpf_attr *attr)
         if (IS_ERR(link))
                 return PTR_ERR(link);
  
+       if (link->ops->update_map) {
+               ret = link_update_map(link, attr);
+               goto out_put_link;
+       }
+
         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
         if (IS_ERR(new_prog)) {
                 ret = PTR_ERR(new_prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c

index 13fc0c1..e8b2782 100644 (file)
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
                 if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
                                      sizeof(tcp_ca->name)) <= 0)
                         return -EINVAL;
-               if (tcp_ca_find(utcp_ca->name))
-                       return -EEXIST;
                 return 1;
         }
  
@@ -266,13 +264,25 @@ static void bpf_tcp_ca_unreg(void *kdata)
         tcp_unregister_congestion_control(kdata);
  }
  
+/* struct_ops ->update hook: forwards the new and old kdata to
+ * tcp_update_congestion_control() and returns its result.
+ */
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+       return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+/* struct_ops ->validate hook: forwards kdata to
+ * tcp_validate_congestion_control() and returns its result.
+ */
+static int bpf_tcp_ca_validate(void *kdata)
+{
+       return tcp_validate_congestion_control(kdata);
+}
+
  struct bpf_struct_ops bpf_tcp_congestion_ops = {
         .verifier_ops = &bpf_tcp_ca_verifier_ops,
         .reg = bpf_tcp_ca_reg,
         .unreg = bpf_tcp_ca_unreg,
+       .update = bpf_tcp_ca_update,
         .check_member = bpf_tcp_ca_check_member,
         .init_member = bpf_tcp_ca_init_member,
         .init = bpf_tcp_ca_init,
+       .validate = bpf_tcp_ca_validate,
         .name = "tcp_congestion_ops",
  };
  
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c

index db8b4b4..1b34050 100644 (file)
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
         return NULL;
  }
  
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
  {
-       int ret = 0;
-
         /* all algorithms must implement these */
         if (!ca->ssthresh || !ca->undo_cwnd ||
             !(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
                 return -EINVAL;
         }
  
+       return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+       int ret;
+
+       ret = tcp_validate_congestion_control(ca);
+       if (ret)
+               return ret;
+
         ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
  
         spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
  }
  EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
  
+/* Replace a registered congestion control (old_ca) with a new one (ca).
+ *
+ * The new ca must pass tcp_validate_congestion_control(), must have
+ * the same name as the entry currently found under old_ca->key, and
+ * that entry must be old_ca itself.
+ *
+ * Returns 0 on success (after waiting for an RCU grace period), the
+ * validation error, or -EINVAL when the old/new pair does not match.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+       struct tcp_congestion_ops *existing;
+       int ret;
+
+       ret = tcp_validate_congestion_control(ca);
+       if (ret)
+               return ret;
+
+       ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+       /* The lookup and the list swap happen under tcp_cong_list_lock. */
+       spin_lock(&tcp_cong_list_lock);
+       existing = tcp_ca_find_key(old_ca->key);
+       if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+               pr_notice("%s not registered or non-unique key\n",
+                         ca->name);
+               ret = -EINVAL;
+       } else if (existing != old_ca) {
+               pr_notice("invalid old congestion control algorithm to replace\n");
+               ret = -EINVAL;
+       } else {
+               /* Add the new one before removing the old one to keep
+                * one implementation available all the time.
+                */
+               list_add_tail_rcu(&ca->list, &tcp_cong_list);
+               list_del_rcu(&existing->list);
+               pr_debug("%s updated\n", ca->name);
+       }
+       spin_unlock(&tcp_cong_list_lock);
+
+       /* Wait for outstanding readers to complete before the
+        * module or struct_ops gets removed entirely.
+        */
+       if (!ret)
+               synchronize_rcu();
+
+       return ret;
+}
+
  u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
  {
         const struct tcp_congestion_ops *ca;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index 13129df..d6c5a02 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
         BPF_PERF_EVENT,
         BPF_TRACE_KPROBE_MULTI,
         BPF_LSM_CGROUP,
+       BPF_STRUCT_OPS,
         __MAX_BPF_ATTACH_TYPE
  };
  
@@ -1266,6 +1267,9 @@ enum {
  
  /* Create a map that is suitable to be an inner map with dynamic max entries */
         BPF_F_INNER_MAP         = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backing bpf_link */
+       BPF_F_LINK              = (1U << 13),
  };
  
  /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
         } task_fd_query;
  
         struct { /* struct used by BPF_LINK_CREATE command */
-               __u32           prog_fd;        /* eBPF program to attach */
+               union {
+                       __u32           prog_fd;        /* eBPF program to attach */
+                       __u32           map_fd;         /* eBPF struct_ops to attach */
+               };
                 union {
                         __u32           target_fd;      /* object to attach to */
                         __u32           target_ifindex; /* target ifindex */
@@ -1548,12 +1555,23 @@ union bpf_attr {
  
         struct { /* struct used by BPF_LINK_UPDATE command */
                 __u32           link_fd;        /* link fd */
-               /* new program fd to update link with */
-               __u32           new_prog_fd;
+               union {
+                       /* new program fd to update link with */
+                       __u32           new_prog_fd;
+                       /* new struct_ops map fd to update link with */
+                       __u32           new_map_fd;
+               };
                 __u32           flags;          /* extra flags */
-               /* expected link's program fd; is specified only if
-                * BPF_F_REPLACE flag is set in flags */
-               __u32           old_prog_fd;
+               union {
+                       /* expected link's program fd; is specified only if
+                        * BPF_F_REPLACE flag is set in flags.
+                        */
+                       __u32           old_prog_fd;
+                       /* expected link's map fd; is specified only
+                        * if BPF_F_REPLACE flag is set.
+                        */
+                       __u32           old_map_fd;
+               };
         } link_update;
  
         struct {
@@ -6379,6 +6397,9 @@ struct bpf_link_info {
                 struct {
                         __u32 ifindex;
                 } xdp;
+               struct {
+                       __u32 map_id;
+               } struct_ops;
         };
  } __attribute__((aligned(8)));
  
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c

index e750b6f..7670359 100644 (file)
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -794,11 +794,17 @@ int bpf_link_update(int link_fd, int new_prog_fd,
         if (!OPTS_VALID(opts, bpf_link_update_opts))
                 return libbpf_err(-EINVAL);
  
+       if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0))
+               return libbpf_err(-EINVAL);
+
         memset(&attr, 0, attr_sz);
         attr.link_update.link_fd = link_fd;
         attr.link_update.new_prog_fd = new_prog_fd;
         attr.link_update.flags = OPTS_GET(opts, flags, 0);
-       attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
+       if (OPTS_GET(opts, old_prog_fd, 0))
+               attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
+       else if (OPTS_GET(opts, old_map_fd, 0))
+               attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0);
  
         ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz);
         return libbpf_err_errno(ret);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h

index f0f7863..b073e73 100644 (file)
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -336,8 +336,9 @@ struct bpf_link_update_opts {
         size_t sz; /* size of this struct for forward/backward compatibility */
         __u32 flags;       /* extra flags */
         __u32 old_prog_fd; /* expected old program FD */
+       __u32 old_map_fd;  /* expected old map FD */
  };
-#define bpf_link_update_opts__last_field old_prog_fd
+#define bpf_link_update_opts__last_field old_map_fd
  
  LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
                                const struct bpf_link_update_opts *opts);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c

index 5d32aa8..f6a071d 100644 (file)
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -116,6 +116,7 @@ static const char * const attach_type_name[] = {
         [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE]    = "sk_reuseport_select_or_migrate",
         [BPF_PERF_EVENT]                = "perf_event",
         [BPF_TRACE_KPROBE_MULTI]        = "trace_kprobe_multi",
+       [BPF_STRUCT_OPS]                = "struct_ops",
  };
  
  static const char * const link_type_name[] = {
@@ -467,6 +468,7 @@ struct bpf_struct_ops {
  #define KCONFIG_SEC ".kconfig"
  #define KSYMS_SEC ".ksyms"
  #define STRUCT_OPS_SEC ".struct_ops"
+#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
  
  enum libbpf_map_type {
         LIBBPF_MAP_UNSPEC,
@@ -596,6 +598,7 @@ struct elf_state {
         Elf64_Ehdr *ehdr;
         Elf_Data *symbols;
         Elf_Data *st_ops_data;
+       Elf_Data *st_ops_link_data;
         size_t shstrndx; /* section index for section name strings */
         size_t strtabidx;
         struct elf_sec_desc *secs;
@@ -605,6 +608,7 @@ struct elf_state {
         int text_shndx;
         int symbols_shndx;
         int st_ops_shndx;
+       int st_ops_link_shndx;
  };
  
  struct usdt_manager;
@@ -1118,7 +1122,8 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
         return 0;
  }
  
-static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
+                               int shndx, Elf_Data *data, __u32 map_flags)
  {
         const struct btf_type *type, *datasec;
         const struct btf_var_secinfo *vsi;
@@ -1129,15 +1134,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
         struct bpf_map *map;
         __u32 i;
  
-       if (obj->efile.st_ops_shndx == -1)
+       if (shndx == -1)
                 return 0;
  
         btf = obj->btf;
-       datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
+       datasec_id = btf__find_by_name_kind(btf, sec_name,
                                             BTF_KIND_DATASEC);
         if (datasec_id < 0) {
                 pr_warn("struct_ops init: DATASEC %s not found\n",
-                       STRUCT_OPS_SEC);
+                       sec_name);
                 return -EINVAL;
         }
  
@@ -1150,7 +1155,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
                 type_id = btf__resolve_type(obj->btf, vsi->type);
                 if (type_id < 0) {
                         pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
-                               vsi->type, STRUCT_OPS_SEC);
+                               vsi->type, sec_name);
                         return -EINVAL;
                 }
  
@@ -1169,7 +1174,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
                 if (IS_ERR(map))
                         return PTR_ERR(map);
  
-               map->sec_idx = obj->efile.st_ops_shndx;
+               map->sec_idx = shndx;
                 map->sec_offset = vsi->offset;
                 map->name = strdup(var_name);
                 if (!map->name)
@@ -1179,6 +1184,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
                 map->def.key_size = sizeof(int);
                 map->def.value_size = type->size;
                 map->def.max_entries = 1;
+               map->def.map_flags = map_flags;
  
                 map->st_ops = calloc(1, sizeof(*map->st_ops));
                 if (!map->st_ops)
@@ -1191,14 +1197,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
                 if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
                         return -ENOMEM;
  
-               if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
+               if (vsi->offset + type->size > data->d_size) {
                         pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
-                               var_name, STRUCT_OPS_SEC);
+                               var_name, sec_name);
                         return -EINVAL;
                 }
  
                 memcpy(st_ops->data,
-                      obj->efile.st_ops_data->d_buf + vsi->offset,
+                      data->d_buf + vsi->offset,
                        type->size);
                 st_ops->tname = tname;
                 st_ops->type = type;
@@ -1211,6 +1217,19 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
         return 0;
  }
  
+static int bpf_object_init_struct_ops(struct bpf_object *obj)
+{
+       int err;
+       /* Init maps from both struct_ops sections; stop at the first error */
+       err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx,
+                                  obj->efile.st_ops_data, 0);
+       err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC,
+                                         obj->efile.st_ops_link_shndx,
+                                         obj->efile.st_ops_link_data,
+                                         BPF_F_LINK);    /* becomes map->def.map_flags */
+       return err;
+}
+
  static struct bpf_object *bpf_object__new(const char *path,
                                           const void *obj_buf,
                                           size_t obj_buf_sz,
@@ -1247,6 +1266,7 @@ static struct bpf_object *bpf_object__new(const char *path,
         obj->efile.obj_buf_sz = obj_buf_sz;
         obj->efile.btf_maps_shndx = -1;
         obj->efile.st_ops_shndx = -1;
+       obj->efile.st_ops_link_shndx = -1;
         obj->kconfig_map_idx = -1;
  
         obj->kern_version = get_kernel_version();
@@ -1264,6 +1284,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
         obj->efile.elf = NULL;
         obj->efile.symbols = NULL;
         obj->efile.st_ops_data = NULL;
+       obj->efile.st_ops_link_data = NULL;
  
         zfree(&obj->efile.secs);
         obj->efile.sec_cnt = 0;
@@ -2618,7 +2639,7 @@ static int bpf_object__init_maps(struct bpf_object *obj,
         err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path);
         err = err ?: bpf_object__init_global_data_maps(obj);
         err = err ?: bpf_object__init_kconfig_map(obj);
-       err = err ?: bpf_object__init_struct_ops_maps(obj);
+       err = err ?: bpf_object_init_struct_ops(obj);
  
         return err;
  }
@@ -2752,12 +2773,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
  {
         return obj->efile.btf_maps_shndx >= 0 ||
                obj->efile.st_ops_shndx >= 0 ||
+              obj->efile.st_ops_link_shndx >= 0 ||
                obj->nr_extern > 0;
  }
  
  static bool kernel_needs_btf(const struct bpf_object *obj)
  {
-       return obj->efile.st_ops_shndx >= 0;
+       return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
  }
  
  static int bpf_object__init_btf(struct bpf_object *obj,
@@ -3450,6 +3472,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                         } else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
                                 obj->efile.st_ops_data = data;
                                 obj->efile.st_ops_shndx = idx;
+                       } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
+                               obj->efile.st_ops_link_data = data;
+                               obj->efile.st_ops_link_shndx = idx;
                         } else {
                                 pr_info("elf: skipping unrecognized data section(%d) %s\n",
                                         idx, name);
@@ -3464,6 +3489,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                         /* Only do relo for section with exec instructions */
                         if (!section_have_execinstr(obj, targ_sec_idx) &&
                             strcmp(name, ".rel" STRUCT_OPS_SEC) &&
+                           strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
                             strcmp(name, ".rel" MAPS_ELF_SEC)) {
                                 pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
                                         idx, name, targ_sec_idx,
@@ -6610,7 +6636,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
                         return -LIBBPF_ERRNO__INTERNAL;
                 }
  
-               if (idx == obj->efile.st_ops_shndx)
+               if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
                         err = bpf_object__collect_st_ops_relos(obj, shdr, data);
                 else if (idx == obj->efile.btf_maps_shndx)
                         err = bpf_object__collect_map_relos(obj, shdr, data);
@@ -7686,6 +7712,37 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
         return 0;
  }
  
+static void bpf_map_prepare_vdata(const struct bpf_map *map)
+{
+       struct bpf_struct_ops *st_ops;
+       __u32 i;
+       /* Write each recorded program's FD into kern_vdata at kern_func_off[i] */
+       st_ops = map->st_ops;
+       for (i = 0; i < btf_vlen(st_ops->type); i++) {
+               struct bpf_program *prog = st_ops->progs[i];
+               void *kern_data;
+               int prog_fd;
+
+               if (!prog)      /* no program recorded for entry i */
+                       continue;
+
+               prog_fd = bpf_program__fd(prog);
+               kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
+               *(unsigned long *)kern_data = prog_fd;  /* slot is written as unsigned long */
+       }
+}
+
+static int bpf_object_prepare_struct_ops(struct bpf_object *obj)
+{
+       int i;
+       /* Run bpf_map_prepare_vdata() on every struct_ops map of obj */
+       for (i = 0; i < obj->nr_maps; i++)
+               if (bpf_map__is_struct_ops(&obj->maps[i]))
+                       bpf_map_prepare_vdata(&obj->maps[i]);
+
+       return 0;       /* always 0: nothing here reports an error */
+}
+
  static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
  {
         int err, i;
@@ -7711,6 +7768,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
         err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
         err = err ? : bpf_object__load_progs(obj, extra_log_level);
         err = err ? : bpf_object_init_prog_arrays(obj);
+       err = err ? : bpf_object_prepare_struct_ops(obj);
  
         if (obj->gen_loader) {
                 /* reset FDs */
@@ -8820,6 +8878,7 @@ const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t)
  }
  
  static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
+                                                    int sec_idx,
                                                      size_t offset)
  {
         struct bpf_map *map;
@@ -8829,7 +8888,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
                 map = &obj->maps[i];
                 if (!bpf_map__is_struct_ops(map))
                         continue;
-               if (map->sec_offset <= offset &&
+               if (map->sec_idx == sec_idx &&
+                   map->sec_offset <= offset &&
                     offset - map->sec_offset < map->def.value_size)
                         return map;
         }
@@ -8871,7 +8931,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
                 }
  
                 name = elf_sym_str(obj, sym->st_name) ?: "<?>";
-               map = find_struct_ops_map_by_offset(obj, rel->r_offset);
+               map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset);
                 if (!map) {
                         pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n",
                                 (size_t)rel->r_offset);
@@ -8938,8 +8998,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
                 }
  
                 /* struct_ops BPF prog can be re-used between multiple
-                * .struct_ops as long as it's the same struct_ops struct
-                * definition and the same function pointer field
+                * .struct_ops & .struct_ops.link as long as it's the
+                * same struct_ops struct definition and the same
+                * function pointer field
                  */
                 if (prog->attach_btf_id != st_ops->type_id ||
                     prog->expected_attach_type != member_idx) {
@@ -11579,22 +11640,30 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
         return link;
  }
  
+struct bpf_link_struct_ops {
+       struct bpf_link link;   /* embedded base link; recovered with container_of() */
+       int map_fd;             /* FD of the backing map, or -1 when there is no real link */
+};
+
  static int bpf_link__detach_struct_ops(struct bpf_link *link)
  {
+       struct bpf_link_struct_ops *st_link;
         __u32 zero = 0;
  
-       if (bpf_map_delete_elem(link->fd, &zero))
-               return -errno;
+       st_link = container_of(link, struct bpf_link_struct_ops, link);
  
-       return 0;
+       if (st_link->map_fd < 0)
+               /* w/o a real link */
+               return bpf_map_delete_elem(link->fd, &zero);
+
+       return close(link->fd);
  }
  
  struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
  {
-       struct bpf_struct_ops *st_ops;
-       struct bpf_link *link;
-       __u32 i, zero = 0;
-       int err;
+       struct bpf_link_struct_ops *link;
+       __u32 zero = 0;
+       int err, fd;
  
         if (!bpf_map__is_struct_ops(map) || map->fd == -1)
                 return libbpf_err_ptr(-EINVAL);
@@ -11603,31 +11672,72 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
         if (!link)
                 return libbpf_err_ptr(-EINVAL);
  
-       st_ops = map->st_ops;
-       for (i = 0; i < btf_vlen(st_ops->type); i++) {
-               struct bpf_program *prog = st_ops->progs[i];
-               void *kern_data;
-               int prog_fd;
+       /* kern_vdata should be prepared during the loading phase. */
+       err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+       /* It can be EBUSY if the map has been used to create or
+        * update a link before.  We don't allow updating the value of
+        * a struct_ops once it is set.  That ensures that the value
+        * never changed.  So, it is safe to skip EBUSY.
+        */
+       if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) {
+               free(link);
+               return libbpf_err_ptr(err);
+       }
  
-               if (!prog)
-                       continue;
+       link->link.detach = bpf_link__detach_struct_ops;
  
-               prog_fd = bpf_program__fd(prog);
-               kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
-               *(unsigned long *)kern_data = prog_fd;
+       if (!(map->def.map_flags & BPF_F_LINK)) {
+               /* w/o a real link */
+               link->link.fd = map->fd;
+               link->map_fd = -1;
+               return &link->link;
         }
  
-       err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
-       if (err) {
-               err = -errno;
+       fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL);
+       if (fd < 0) {
                 free(link);
-               return libbpf_err_ptr(err);
+               return libbpf_err_ptr(fd);
         }
  
-       link->detach = bpf_link__detach_struct_ops;
-       link->fd = map->fd;
+       link->link.fd = fd;
+       link->map_fd = map->fd;
  
-       return link;
+       return &link->link;
+}
+
+/*
+ * Swap the backing struct_ops map of a link with a new struct_ops map.
+ * Returns 0 on success or a negative error code on failure.
+ */
+int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
+{
+       struct bpf_link_struct_ops *st_ops_link;
+       __u32 zero = 0;
+       int err;
+
+       if (!bpf_map__is_struct_ops(map) || map->fd < 0)
+               return libbpf_err(-EINVAL);
+
+       st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
+       /* Only links backed by a real link FD (map_fd >= 0) can be updated */
+       if (st_ops_link->map_fd < 0)
+               return libbpf_err(-EINVAL);
+
+       err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+       /* It can be EBUSY if the map has been used to create or
+        * update a link before.  We don't allow updating the value of
+        * a struct_ops once it is set.  That ensures that the value
+        * never changes.  So, it is safe to skip EBUSY.
+        */
+       if (err && err != -EBUSY)
+               return err;
+
+       err = bpf_link_update(link->fd, map->fd, NULL);
+       if (err < 0)
+               return err;
+
+       st_ops_link->map_fd = map->fd;
+       return 0;
  }
  
  typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h

index db4992a..1615e55 100644 (file)
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -719,6 +719,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
  struct bpf_map;
  
  LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
  
  struct bpf_iter_attach_opts {
         size_t sz; /* size of this struct for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map

index 50dde1f..a5aa3a3 100644 (file)
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -386,6 +386,7 @@ LIBBPF_1.1.0 {
  LIBBPF_1.2.0 {
         global:
                 bpf_btf_get_info_by_fd;
+               bpf_link__update_map;
                 bpf_link_get_info_by_fd;
                 bpf_map_get_info_by_fd;
                 bpf_prog_get_info_by_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c

index e980188..a53c254 100644 (file)
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -8,6 +8,7 @@
  #include "bpf_dctcp.skel.h"
  #include "bpf_cubic.skel.h"
  #include "bpf_tcp_nogpl.skel.h"
+#include "tcp_ca_update.skel.h"
  #include "bpf_dctcp_release.skel.h"
  #include "tcp_ca_write_sk_pacing.skel.h"
  #include "tcp_ca_incompl_cong_ops.skel.h"
@@ -381,6 +382,155 @@ static void test_unsupp_cong_op(void)
         libbpf_set_print(old_print_fn);
  }
  
+/* Swap a link from ca_update_1 to ca_update_2; only ca2's init may run after. */
+static void test_update_ca(void)
+{
+       struct tcp_ca_update *skel;
+       struct bpf_link *link;
+       int saved_ca1_cnt, err;
+
+       skel = tcp_ca_update__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open"))
+               return;
+
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+               goto out;
+
+       do_test("tcp_ca_update", NULL);
+       saved_ca1_cnt = skel->bss->ca1_cnt;
+       ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+       err = bpf_link__update_map(link, skel->maps.ca_update_2);
+       ASSERT_OK(err, "update_map");
+
+       do_test("tcp_ca_update", NULL);
+       ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+       ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
+       bpf_link__destroy(link);
+out:
+       tcp_ca_update__destroy(skel);
+}
+
+/* A rejected update (to ca_wrong) must leave ca_update_1 attached and running. */
+static void test_update_wrong(void)
+{
+       struct tcp_ca_update *skel;
+       struct bpf_link *link;
+       int saved_ca1_cnt, err;
+
+       skel = tcp_ca_update__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open"))
+               return;
+
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+               goto out;
+
+       do_test("tcp_ca_update", NULL);
+       saved_ca1_cnt = skel->bss->ca1_cnt;
+       ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+       err = bpf_link__update_map(link, skel->maps.ca_wrong);
+       ASSERT_ERR(err, "update_map");
+
+       do_test("tcp_ca_update", NULL);
+       ASSERT_GT(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+       bpf_link__destroy(link);
+out:
+       tcp_ca_update__destroy(skel);
+}
+
+/* A link must refuse to be updated to a map from the plain .struct_ops section. */
+static void test_mixed_links(void)
+{
+       struct tcp_ca_update *skel;
+       struct bpf_link *link, *link_nl;
+       int err;
+
+       skel = tcp_ca_update__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open"))
+               return;
+
+       link_nl = bpf_map__attach_struct_ops(skel->maps.ca_no_link);
+       ASSERT_OK_PTR(link_nl, "attach_struct_ops_nl");
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+               goto out;
+
+       do_test("tcp_ca_update", NULL);
+       ASSERT_GT(skel->bss->ca1_cnt, 0, "ca1_ca1_cnt");
+       err = bpf_link__update_map(link, skel->maps.ca_no_link);
+       ASSERT_ERR(err, "update_map");
+       bpf_link__destroy(link);
+out:
+       bpf_link__destroy(link_nl);
+       tcp_ca_update__destroy(skel);
+}
+
+static void test_multi_links(void)
+{
+       struct tcp_ca_update *skel;
+       struct bpf_link *link;
+       /* Attach, destroy and re-attach ca_update_1 to check the map is reusable */
+       skel = tcp_ca_update__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open"))
+               return;
+
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       ASSERT_OK_PTR(link, "attach_struct_ops_1st");
+       bpf_link__destroy(link);
+
+       /* The same map must be attachable again after its first link
+        * was destroyed, i.e. one map can back links multiple times.
+        */
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       ASSERT_OK_PTR(link, "attach_struct_ops_2nd");
+       bpf_link__destroy(link);
+
+       tcp_ca_update__destroy(skel);
+}
+
+/* BPF_F_REPLACE must update the link only when old_map_fd matches its map. */
+static void test_link_replace(void)
+{
+       DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts);
+       struct tcp_ca_update *skel;
+       struct bpf_link *link;
+       int err;
+
+       skel = tcp_ca_update__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "open"))
+               return;
+
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+       ASSERT_OK_PTR(link, "attach_struct_ops_1st");
+       bpf_link__destroy(link);
+
+       link = bpf_map__attach_struct_ops(skel->maps.ca_update_2);
+       if (!ASSERT_OK_PTR(link, "attach_struct_ops_2nd"))
+               goto out;
+
+       /* BPF_F_REPLACE with a wrong old map FD. It should fail!
+        * With BPF_F_REPLACE, the link should be updated only if the
+        * old map fd given here matches the map backing the link.
+        */
+       opts.old_map_fd = bpf_map__fd(skel->maps.ca_update_1);
+       opts.flags = BPF_F_REPLACE;
+       err = bpf_link_update(bpf_link__fd(link),
+                             bpf_map__fd(skel->maps.ca_update_1),
+                             &opts);
+       ASSERT_ERR(err, "bpf_link_update_fail");
+
+       /* BPF_F_REPLACE with a correct old map FD. It should succeed! */
+       opts.old_map_fd = bpf_map__fd(skel->maps.ca_update_2);
+       err = bpf_link_update(bpf_link__fd(link),
+                             bpf_map__fd(skel->maps.ca_update_1),
+                             &opts);
+       ASSERT_OK(err, "bpf_link_update_success");
+       bpf_link__destroy(link);
+out:
+       tcp_ca_update__destroy(skel);
+}
+
  void test_bpf_tcp_ca(void)
  {
         if (test__start_subtest("dctcp"))
@@ -399,4 +549,14 @@ void test_bpf_tcp_ca(void)
                 test_incompl_cong_ops();
         if (test__start_subtest("unsupp_cong_op"))
                 test_unsupp_cong_op();
+       if (test__start_subtest("update_ca"))
+               test_update_ca();
+       if (test__start_subtest("update_wrong"))
+               test_update_wrong();
+       if (test__start_subtest("mixed_links"))
+               test_mixed_links();
+       if (test__start_subtest("multi_links"))
+               test_multi_links();
+       if (test__start_subtest("link_replace"))
+               test_link_replace();
  }
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c

new file mode 100644 (file)

index 0000000..b93a0ed
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int ca1_cnt = 0;
+int ca2_cnt = 0;
+
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+       return (struct tcp_sock *)sk;   /* plain pointer cast; also drops const */
+}
+
+SEC("struct_ops/ca_update_1_init")
+void BPF_PROG(ca_update_1_init, struct sock *sk)
+{
+       ca1_cnt++;      /* counts invocations of ca_update_1's .init */
+}
+
+SEC("struct_ops/ca_update_2_init")
+void BPF_PROG(ca_update_2_init, struct sock *sk)
+{
+       ca2_cnt++;      /* counts invocations of ca_update_2's .init */
+}
+
+SEC("struct_ops/ca_update_cong_control")
+void BPF_PROG(ca_update_cong_control, struct sock *sk,
+             const struct rate_sample *rs)
+{      /* empty body: this callback does nothing; shared by every map below */
+}
+
+SEC("struct_ops/ca_update_ssthresh")
+__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
+{
+       return tcp_sk(sk)->snd_ssthresh;        /* returns the socket's current value unchanged */
+}
+
+SEC("struct_ops/ca_update_undo_cwnd")
+__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
+{
+       return tcp_sk(sk)->snd_cwnd;    /* returns the socket's current value unchanged */
+}
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_1 = {
+       .init = (void *)ca_update_1_init,       /* bumps ca1_cnt */
+       .cong_control = (void *)ca_update_cong_control,
+       .ssthresh = (void *)ca_update_ssthresh,
+       .undo_cwnd = (void *)ca_update_undo_cwnd,
+       .name = "tcp_ca_update",                /* same name as ca_update_2 */
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_2 = {
+       .init = (void *)ca_update_2_init,       /* bumps ca2_cnt */
+       .cong_control = (void *)ca_update_cong_control,
+       .ssthresh = (void *)ca_update_ssthresh,
+       .undo_cwnd = (void *)ca_update_undo_cwnd,
+       .name = "tcp_ca_update",                /* same name as ca_update_1 */
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_wrong = {
+       .cong_control = (void *)ca_update_cong_control,
+       .ssthresh = (void *)ca_update_ssthresh,
+       .undo_cwnd = (void *)ca_update_undo_cwnd,
+       .name = "tcp_ca_wrong", /* NOTE(review): differs from "tcp_ca_update"; presumably why the update is rejected - confirm */
+};
+
+SEC(".struct_ops")     /* plain section, not ".struct_ops.link" like the maps above */
+struct tcp_congestion_ops ca_no_link = {
+       .cong_control = (void *)ca_update_cong_control,
+       .ssthresh = (void *)ca_update_ssthresh,
+       .undo_cwnd = (void *)ca_update_undo_cwnd,
+       .name = "tcp_ca_no_link",
+};
author	Martin KaFai Lau <martin.lau@kernel.org>
	Thu, 23 Mar 2023 05:49:40 +0000 (22:49 -0700)
committer	Martin KaFai Lau <martin.lau@kernel.org>
	Thu, 23 Mar 2023 05:53:27 +0000 (22:53 -0700)
include/linux/bpf.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
include/uapi/linux/bpf.h		patch \| blob \| history
kernel/bpf/bpf_struct_ops.c		patch \| blob \| history
kernel/bpf/syscall.c		patch \| blob \| history
net/ipv4/bpf_tcp_ca.c		patch \| blob \| history
net/ipv4/tcp_cong.c		patch \| blob \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| history
tools/lib/bpf/bpf.c		patch \| blob \| history
tools/lib/bpf/bpf.h		patch \| blob \| history
tools/lib/bpf/libbpf.c		patch \| blob \| history
tools/lib/bpf/libbpf.h		patch \| blob \| history
tools/lib/bpf/libbpf.map		patch \| blob \| history
tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c		patch \| blob \| history
tools/testing/selftests/bpf/progs/tcp_ca_update.c	[new file with mode: 0644]	patch \| blob