From 15a07b33814d14ca817887dbea8530728dc0fbe4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:55 -0800 Subject: [PATCH] bpf: add lookup/update support for per-cpu hash and array maps The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set values from all-cpus for per-cpu hash and array maps, so that user space can aggregate/update them as necessary. Example of single counter aggregation in user space: unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); long values[nr_cpus]; long value = 0; bpf_lookup_elem(fd, key, values); for (i = 0; i < nr_cpus; i++) value += values[i]; The user space must provide round_up(value_size, 8) * nr_cpus array to get/set values, since kernel will use 'long' copy of per-cpu values to try to copy good counters atomically. It's a best-effort, since bpf programs and user space are racing to access the same memory. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 23 ++++++++++++++ kernel/bpf/arraymap.c | 64 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 83 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++----------- 4 files changed, 201 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 141fb0d4..90ee6ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 flags); +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 flags); + +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b9bf1d7..bd3bdf2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index]); } +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall or from eBPF program */ static int array_map_delete_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2be5f6e..fd5db8f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu) + bool percpu, bool onallcpus) { u32 size = htab->map.value_size; struct htab_elem *l_new; @@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, return NULL; } - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); @@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* allocate new element outside of the lock, since * we're most likley going to insert it */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); if (!l_new) return -ENOMEM; @@ -402,8 +412,9 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + /* per-cpu hash map can update value in-place */ - memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), - value, htab->map.value_size); + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true); + hash, true, onallcpus); if (!l_new) { ret = -ENOMEM; goto err; @@ -455,6 +479,12 @@ err: return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, true); +} + static const struct bpf_map_ops htab_percpu_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6373970..c95a753 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here - */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } free_value: kfree(value); -- 2.7.4