insn->src_reg == BPF_PSEUDO_CALL;
}
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
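Note: a function-pointer load is an ordinary two-slot ld_imm64 distinguished only by its src_reg; per the subprog scan below, insn->imm carries the insn delta to the callback minus one, and insn[1].imm is later overwritten with the subprog index. A minimal sketch of the raw encoding (register and delta values are illustrative):

	/* Sketch: raw encoding of "rX = <callback>" (illustrative values).
	 * insn[0].imm holds (callback_insn_idx - this_insn_idx - 1), see
	 * the add_subprog() call in the subprog scan below; insn[1].imm
	 * starts as 0 and is rewritten by the verifier to the subprog
	 * index.
	 */
	struct bpf_insn fp_load[2] = {
		{ .code    = BPF_LD | BPF_IMM | BPF_DW,
		  .dst_reg = BPF_REG_2,		/* illustrative register */
		  .src_reg = BPF_PSEUDO_FUNC,
		  .imm     = 7 },		/* hypothetical insn delta - 1 */
		{ 0 },				/* upper half of the imm64 */
	};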
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
+ u32 subprogno;
};
struct btf *btf_vmlinux;
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON;
}
type == ARG_PTR_TO_MEM_OR_NULL ||
type == ARG_PTR_TO_CTX_OR_NULL ||
type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+ type == ARG_PTR_TO_STACK_OR_NULL;
}
/* Determine whether the function releases some resources allocated by another
[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
[PTR_TO_RDWR_BUF] = "rdwr_buf",
[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
};
static char slot_type_char[] = {
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_KEY ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
verbose(env, ",ks=%d,vs=%d",
/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
+ if (bpf_pseudo_func(insn + i)) {
+ if (!env->bpf_capable) {
+ verbose(env,
+ "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ return -EPERM;
+ }
+ ret = add_subprog(env, i + insn[i].imm + 1);
+ if (ret < 0)
+ return ret;
+ /* remember subprog */
+ insn[i + 1].imm = ret;
+ continue;
+ }
if (!bpf_pseudo_call(insn + i))
continue;
if (!env->bpf_capable) {
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_MEM_OR_NULL:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
return true;
default:
return false;
reg = &cur_regs(env)[regno];
switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size);
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (!bpf_pseudo_call(insn + i))
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
/* for access checks, reg->off is just part of off */
off += reg->off;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
if (check_map_access_type(env, regno, reg->off, access_size,
meta && meta->raw_mode ? BPF_WRITE :
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
PTR_TO_RDONLY_BUF,
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_FUNC) {
+ meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
}
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ if (bpf_map_ptr_poisoned(insn_aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
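Note: the per-map ->map_set_for_each_callback_args hook above is what hands the callback its initial register state. A minimal sketch of such a hook, mirroring the common implementation that array and hash maps plug in elsewhere in this series (assuming the callback signature callback_fn(struct bpf_map *map, void *key, void *value, void *callback_ctx)):

	static int example_set_callback_args(struct bpf_verifier_env *env,
					     struct bpf_func_state *caller,
					     struct bpf_func_state *callee)
	{
		/* Caller: bpf_for_each_map_elem(map, callback_fn,
		 *                               callback_ctx, flags).
		 * Callee: callback_fn(map, key, value, callback_ctx).
		 */
		callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];

		callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
		__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
		callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;

		callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
		__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
		callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;

		/* the caller's R3 (callback_ctx) becomes the callback's R4 */
		callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];

		/* R5 is unused by the callback */
		__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
		return 0;
	}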
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
state->curframe--;
caller = state->frame[state->curframe];
- /* return to the caller whatever r0 had in the callee */
- caller->regs[BPF_REG_0] = *r0;
+ if (callee->in_callback_fn) {
+ /* enforce R0 return value range [0, 1]. */
+ struct tnum range = tnum_range(0, 1);
+
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
+ return -EACCES;
+ }
+ if (!tnum_in(range, r0->var_off)) {
+ verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+ return -EINVAL;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
/* Transfer references to the caller */
err = transfer_reference_state(caller, callee);
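Note on the range check above: tnum_range(0, 1) yields { .value = 0, .mask = 1 }, i.e. only bit 0 is unknown, so the check accepts any R0 whose possible values are a subset of {0, 1} (0 = continue iterating, 1 = stop). The containment test is essentially:

	/* Semantics sketch of tnum_in(a, b): b fits in a iff b has no
	 * unknown bits that a doesn't, and the two agree on a's known bits.
	 */
	static bool tnum_in_sketch(struct tnum a, struct tnum b)
	{
		if (b.mask & ~a.mask)
			return false;
		b.value &= ~a.mask;
		return a.value == b.value;
	}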
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_map_push_elem &&
func_id != BPF_FUNC_map_pop_elem &&
- func_id != BPF_FUNC_map_peek_elem)
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem)
return 0;
if (map == NULL) {
return state->acquired_refs ? -EINVAL : 0;
}
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
bool changes_data;
- int i, err;
+ int i, err, func_id;
/* find function prototype */
+ func_id = insn->imm;
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
func_id);
return -EINVAL;
}
+ if (func_id == BPF_FUNC_for_each_map_elem) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_map_elem_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
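For context, the BPF-C shape this wiring enables looks like the sketch below. All names are illustrative, and the bpf_for_each_map_elem() declaration comes from libbpf's generated helper defs on kernels that carry the rest of this series, not from this hunk:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 16);
		__type(key, __u32);
		__type(value, __u64);
	} vals SEC(".maps");

	struct cb_ctx {
		__u64 sum;
	};

	/* The callback must be static (see the BTF_FUNC_STATIC check in
	 * check_ld_imm below) and must return 0 (continue) or 1 (stop),
	 * matching the R0 range [0, 1] enforced in prepare_func_exit().
	 */
	static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
			      struct cb_ctx *ctx)
	{
		ctx->sum += *val;
		return 0;
	}

	SEC("tc")
	int sum_all(struct __sk_buff *skb)
	{
		struct cb_ctx ctx = { .sum = 0 };

		bpf_for_each_map_elem(&vals, sum_elem, &ctx, 0);
		return ctx.sum != 0;
	}

	char LICENSE[] SEC("license") = "GPL";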
else
*ptr_limit = -off;
return 0;
+ case PTR_TO_MAP_KEY:
+ /* Currently, this code is not exercised as the only use
+ * is the bpf_for_each_map_elem() helper, which requires
+ * bpf_capable. The code has been tested manually for
+ * future use.
+ */
+ if (mask_to_left) {
+ *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+ } else {
+ off = ptr_reg->smin_value + ptr_reg->off;
+ *ptr_limit = ptr_reg->map_ptr->key_size - off;
+ }
+ return 0;
case PTR_TO_MAP_VALUE:
if (mask_to_left) {
*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
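A worked example for the PTR_TO_MAP_KEY branch above, with illustrative numbers:

	/* key_size = 8, ptr_reg->off = 2, ptr_reg->smin_value = 0,
	 * mask_to_left == false:
	 *   off        = smin_value + reg->off = 0 + 2 = 2
	 *   *ptr_limit = key_size - off        = 8 - 2 = 6
	 * i.e. at most 6 bytes remain ahead of the pointer inside the
	 * key, which bounds the speculative-access masking.
	 */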
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
return 0;
}
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = insn[1].imm;
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
+ return 0;
+ }
+
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
struct bpf_insn *insns = env->prog->insnsi;
int ret;
+ if (bpf_pseudo_func(insns + t))
+ return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
BPF_CLASS(insns[t].code) != BPF_JMP32)
*/
return false;
}
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
- err = check_helper_call(env, insn->imm, env->insn_idx);
+ err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
-
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
goto next_insn;
}
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
}
/* single env->prog->insni[off] instruction was replaced with the range
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* subprog is encoded in insn[1].imm */
+ continue;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
/* Upon error here we cannot fall back to interpreter but
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn[1].imm;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
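The two imm writes above follow the standard ld_imm64 layout: the first slot's imm carries the low 32 bits of the callback's JITed address and the second slot's imm the high 32 bits, so at load time the destination register receives:

	/* reassembly of the 64-bit callback address (ld_imm64 semantics) */
	u64 addr = (u32)insn[0].imm | ((u64)(u32)insn[1].imm << 32);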
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
return -EINVAL;
}
for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails, progs with callback calls have to be
+ * rejected, since the interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);