kernel/bpf/syscall.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/bpf-cgroup.h>
6 #include <linux/bpf_trace.h>
7 #include <linux/bpf_lirc.h>
8 #include <linux/bpf_verifier.h>
9 #include <linux/bsearch.h>
10 #include <linux/btf.h>
11 #include <linux/syscalls.h>
12 #include <linux/slab.h>
13 #include <linux/sched/signal.h>
14 #include <linux/vmalloc.h>
15 #include <linux/mmzone.h>
16 #include <linux/anon_inodes.h>
17 #include <linux/fdtable.h>
18 #include <linux/file.h>
19 #include <linux/fs.h>
20 #include <linux/license.h>
21 #include <linux/filter.h>
22 #include <linux/kernel.h>
23 #include <linux/idr.h>
24 #include <linux/cred.h>
25 #include <linux/timekeeping.h>
26 #include <linux/ctype.h>
27 #include <linux/nospec.h>
28 #include <linux/audit.h>
29 #include <uapi/linux/btf.h>
30 #include <linux/pgtable.h>
31 #include <linux/bpf_lsm.h>
32 #include <linux/poll.h>
33 #include <linux/sort.h>
34 #include <linux/bpf-netns.h>
35 #include <linux/rcupdate_trace.h>
36 #include <linux/memcontrol.h>
37 #include <linux/trace_events.h>
38
39 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
40                           (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
41                           (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
42 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
43 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
44 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
45                         IS_FD_HASH(map))
46
47 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
48
49 DEFINE_PER_CPU(int, bpf_prog_active);
50 static DEFINE_IDR(prog_idr);
51 static DEFINE_SPINLOCK(prog_idr_lock);
52 static DEFINE_IDR(map_idr);
53 static DEFINE_SPINLOCK(map_idr_lock);
54 static DEFINE_IDR(link_idr);
55 static DEFINE_SPINLOCK(link_idr_lock);
56
57 int sysctl_unprivileged_bpf_disabled __read_mostly =
58         IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
59
60 static const struct bpf_map_ops * const bpf_map_types[] = {
61 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
62 #define BPF_MAP_TYPE(_id, _ops) \
63         [_id] = &_ops,
64 #define BPF_LINK_TYPE(_id, _name)
65 #include <linux/bpf_types.h>
66 #undef BPF_PROG_TYPE
67 #undef BPF_MAP_TYPE
68 #undef BPF_LINK_TYPE
69 };
70
71 /*
72  * If we're handed a bigger struct than we know of, ensure all the unknown bits
73  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
74  * we don't know about yet.
75  *
76  * There is a ToCToU between this function call and the following
77  * copy_from_user() call. However, this is not a concern since this function is
78  * meant to be a future-proofing of bits.
79  */
80 int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
81                              size_t expected_size,
82                              size_t actual_size)
83 {
84         int res;
85
86         if (unlikely(actual_size > PAGE_SIZE))  /* silly large */
87                 return -E2BIG;
88
89         if (actual_size <= expected_size)
90                 return 0;
91
92         if (uaddr.is_kernel)
93                 res = memchr_inv(uaddr.kernel + expected_size, 0,
94                                  actual_size - expected_size) == NULL;
95         else
96                 res = check_zeroed_user(uaddr.user + expected_size,
97                                         actual_size - expected_size);
98         if (res < 0)
99                 return res;
100         return res ? 0 : -E2BIG;
101 }
102
103 const struct bpf_map_ops bpf_map_offload_ops = {
104         .map_meta_equal = bpf_map_meta_equal,
105         .map_alloc = bpf_map_offload_map_alloc,
106         .map_free = bpf_map_offload_map_free,
107         .map_check_btf = map_check_no_btf,
108 };
109
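/* Resolve attr->map_type to its ops via the bpf_map_types table (using
 * array_index_nospec() to avoid speculative out-of-bounds access), run the
 * optional ->map_alloc_check() hook, switch to the offload ops when a
 * map_ifindex is given, and finally let ->map_alloc() create the map.
 */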
110 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
111 {
112         const struct bpf_map_ops *ops;
113         u32 type = attr->map_type;
114         struct bpf_map *map;
115         int err;
116
117         if (type >= ARRAY_SIZE(bpf_map_types))
118                 return ERR_PTR(-EINVAL);
119         type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
120         ops = bpf_map_types[type];
121         if (!ops)
122                 return ERR_PTR(-EINVAL);
123
124         if (ops->map_alloc_check) {
125                 err = ops->map_alloc_check(attr);
126                 if (err)
127                         return ERR_PTR(err);
128         }
129         if (attr->map_ifindex)
130                 ops = &bpf_map_offload_ops;
131         map = ops->map_alloc(attr);
132         if (IS_ERR(map))
133                 return map;
134         map->ops = ops;
135         map->map_type = type;
136         return map;
137 }
138
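/* map->writecnt tracks outstanding writable references to the map (writable
 * mmap()s and in-flight write syscalls), so that code which must rule out
 * concurrent syscall-side writers (e.g. map freezing) can test
 * bpf_map_write_active().
 */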
139 static void bpf_map_write_active_inc(struct bpf_map *map)
140 {
141         atomic64_inc(&map->writecnt);
142 }
143
144 static void bpf_map_write_active_dec(struct bpf_map *map)
145 {
146         atomic64_dec(&map->writecnt);
147 }
148
149 bool bpf_map_write_active(const struct bpf_map *map)
150 {
151         return atomic64_read(&map->writecnt) != 0;
152 }
153
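/* Size of the value as seen by the syscall interface: per-CPU maps expose
 * one 8-byte-aligned slot per possible CPU, fd-based maps take a u32 fd,
 * everything else uses the plain value_size.
 */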
154 static u32 bpf_map_value_size(const struct bpf_map *map)
155 {
156         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
157             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
158             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
159             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
160                 return round_up(map->value_size, 8) * num_possible_cpus();
161         else if (IS_FD_MAP(map))
162                 return sizeof(u32);
163         else
164                 return  map->value_size;
165 }
166
167 static void maybe_wait_bpf_programs(struct bpf_map *map)
168 {
169         /* Wait for any running BPF programs to complete so that
170          * userspace, when we return to it, knows that all programs
171          * that could be running use the new map value.
172          */
173         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
174             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
175                 synchronize_rcu();
176 }
177
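/* Syscall-side element update: device-bound maps, cpumap/struct_ops,
 * sockmap/sockhash and fd prog arrays are dispatched to dedicated helpers;
 * all other map types are updated with instrumentation disabled and, where
 * required, under rcu_read_lock().
 */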
178 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
179                                 void *key, void *value, __u64 flags)
180 {
181         int err;
182
183         /* Need to create a kthread, thus must support schedule */
184         if (bpf_map_is_dev_bound(map)) {
185                 return bpf_map_offload_update_elem(map, key, value, flags);
186         } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
187                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
188                 return map->ops->map_update_elem(map, key, value, flags);
189         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
190                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
191                 return sock_map_update_elem_sys(map, key, value, flags);
192         } else if (IS_FD_PROG_ARRAY(map)) {
193                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
194                                                     flags);
195         }
196
197         bpf_disable_instrumentation();
198         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
199             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
200                 err = bpf_percpu_hash_update(map, key, value, flags);
201         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
202                 err = bpf_percpu_array_update(map, key, value, flags);
203         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
204                 err = bpf_percpu_cgroup_storage_update(map, key, value,
205                                                        flags);
206         } else if (IS_FD_ARRAY(map)) {
207                 rcu_read_lock();
208                 err = bpf_fd_array_map_update_elem(map, map_file, key, value,
209                                                    flags);
210                 rcu_read_unlock();
211         } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
212                 rcu_read_lock();
213                 err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
214                                                   flags);
215                 rcu_read_unlock();
216         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
217                 /* rcu_read_lock() is not needed */
218                 err = bpf_fd_reuseport_array_update_elem(map, key, value,
219                                                          flags);
220         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
221                    map->map_type == BPF_MAP_TYPE_STACK ||
222                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
223                 err = map->ops->map_push_elem(map, value, flags);
224         } else {
225                 rcu_read_lock();
226                 err = map->ops->map_update_elem(map, key, value, flags);
227                 rcu_read_unlock();
228         }
229         bpf_enable_instrumentation();
230         maybe_wait_bpf_programs(map);
231
232         return err;
233 }
234
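/* Syscall-side element lookup: copy the element selected by 'key' into the
 * caller-provided 'value' buffer, dispatching per map type; the generic
 * path honours BPF_F_LOCK and masks special fields (lock, timer) out of
 * the copied value.
 */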
235 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
236                               __u64 flags)
237 {
238         void *ptr;
239         int err;
240
241         if (bpf_map_is_dev_bound(map))
242                 return bpf_map_offload_lookup_elem(map, key, value);
243
244         bpf_disable_instrumentation();
245         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
246             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
247                 err = bpf_percpu_hash_copy(map, key, value);
248         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
249                 err = bpf_percpu_array_copy(map, key, value);
250         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
251                 err = bpf_percpu_cgroup_storage_copy(map, key, value);
252         } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
253                 err = bpf_stackmap_copy(map, key, value);
254         } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
255                 err = bpf_fd_array_map_lookup_elem(map, key, value);
256         } else if (IS_FD_HASH(map)) {
257                 err = bpf_fd_htab_map_lookup_elem(map, key, value);
258         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
259                 err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
260         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
261                    map->map_type == BPF_MAP_TYPE_STACK ||
262                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
263                 err = map->ops->map_peek_elem(map, value);
264         } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
265                 /* struct_ops map requires directly updating "value" */
266                 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
267         } else {
268                 rcu_read_lock();
269                 if (map->ops->map_lookup_elem_sys_only)
270                         ptr = map->ops->map_lookup_elem_sys_only(map, key);
271                 else
272                         ptr = map->ops->map_lookup_elem(map, key);
273                 if (IS_ERR(ptr)) {
274                         err = PTR_ERR(ptr);
275                 } else if (!ptr) {
276                         err = -ENOENT;
277                 } else {
278                         err = 0;
279                         if (flags & BPF_F_LOCK)
280                                 /* lock 'ptr' and copy everything but lock */
281                                 copy_map_value_locked(map, value, ptr, true);
282                         else
283                                 copy_map_value(map, value, ptr);
284                         /* mask lock and timer, since value wasn't zero inited */
285                         check_and_init_map_value(map, value);
286                 }
287                 rcu_read_unlock();
288         }
289
290         bpf_enable_instrumentation();
291         maybe_wait_bpf_programs(map);
292
293         return err;
294 }
295
296 /* Please do not use this function outside of the map creation path
297  * (e.g. in the map update path) without taking care of setting the active
298  * memory cgroup (see bpf_map_kmalloc_node() for an example).
299  */
300 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
301 {
302         /* We really just want to fail instead of triggering OOM killer
303          * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
304          * which is used for lower order allocation requests.
305          *
306          * It has been observed that higher order allocation requests done by
307          * vmalloc with __GFP_NORETRY being set might fail due to not trying
308          * to reclaim memory from the page cache, thus we set
309          * __GFP_RETRY_MAYFAIL to avoid such situations.
310          */
311
312         const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
313         unsigned int flags = 0;
314         unsigned long align = 1;
315         void *area;
316
317         if (size >= SIZE_MAX)
318                 return NULL;
319
320         /* kmalloc()'ed memory can't be mmap()'ed */
321         if (mmapable) {
322                 BUG_ON(!PAGE_ALIGNED(size));
323                 align = SHMLBA;
324                 flags = VM_USERMAP;
325         } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
326                 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
327                                     numa_node);
328                 if (area != NULL)
329                         return area;
330         }
331
332         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
333                         gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
334                         flags, numa_node, __builtin_return_address(0));
335 }
336
337 void *bpf_map_area_alloc(u64 size, int numa_node)
338 {
339         return __bpf_map_area_alloc(size, numa_node, false);
340 }
341
342 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
343 {
344         return __bpf_map_area_alloc(size, numa_node, true);
345 }
346
347 void bpf_map_area_free(void *area)
348 {
349         kvfree(area);
350 }
351
352 static u32 bpf_map_flags_retain_permanent(u32 flags)
353 {
354         /* Some map creation flags are not tied to the map object but
355          * rather to the map fd instead, so they have no meaning upon
356          * map object inspection since multiple file descriptors with
357          * different (access) properties can exist here. Thus, given
358          * this has zero meaning for the map itself, let's clear these
359          * flags here.
360          */
361         return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
362 }
363
364 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
365 {
366         map->map_type = attr->map_type;
367         map->key_size = attr->key_size;
368         map->value_size = attr->value_size;
369         map->max_entries = attr->max_entries;
370         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
371         map->numa_node = bpf_map_attr_numa_node(attr);
372         map->map_extra = attr->map_extra;
373 }
374
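/* Publish the map in the global map_idr so it can be iterated and looked up
 * by id (BPF_MAP_GET_NEXT_ID / BPF_MAP_GET_FD_BY_ID); ids are handed out
 * cyclically to avoid immediate reuse.
 */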
375 static int bpf_map_alloc_id(struct bpf_map *map)
376 {
377         int id;
378
379         idr_preload(GFP_KERNEL);
380         spin_lock_bh(&map_idr_lock);
381         id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
382         if (id > 0)
383                 map->id = id;
384         spin_unlock_bh(&map_idr_lock);
385         idr_preload_end();
386
387         if (WARN_ON_ONCE(!id))
388                 return -ENOSPC;
389
390         return id > 0 ? 0 : id;
391 }
392
393 void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
394 {
395         unsigned long flags;
396
397         /* Offloaded maps are removed from the IDR store when their device
398          * disappears - even if someone holds an fd to them they are unusable,
399          * the memory is gone, all ops will fail; they are simply waiting for
400          * refcnt to drop to be freed.
401          */
402         if (!map->id)
403                 return;
404
405         if (do_idr_lock)
406                 spin_lock_irqsave(&map_idr_lock, flags);
407         else
408                 __acquire(&map_idr_lock);
409
410         idr_remove(&map_idr, map->id);
411         map->id = 0;
412
413         if (do_idr_lock)
414                 spin_unlock_irqrestore(&map_idr_lock, flags);
415         else
416                 __release(&map_idr_lock);
417 }
418
419 #ifdef CONFIG_MEMCG_KMEM
420 static void bpf_map_save_memcg(struct bpf_map *map)
421 {
422         /* Currently if a map is created by a process belonging to the root
423          * memory cgroup, get_obj_cgroup_from_current() will return NULL.
424          * So we have to check map->objcg for being NULL each time it's
425          * being used.
426          */
427         map->objcg = get_obj_cgroup_from_current();
428 }
429
430 static void bpf_map_release_memcg(struct bpf_map *map)
431 {
432         if (map->objcg)
433                 obj_cgroup_put(map->objcg);
434 }
435
436 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
437 {
438         if (map->objcg)
439                 return get_mem_cgroup_from_objcg(map->objcg);
440
441         return root_mem_cgroup;
442 }
443
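/* Memcg-aware allocation helpers: charge the allocation to the memory
 * cgroup that created the map (saved in map->objcg) by temporarily making
 * it the active memcg around the underlying allocator call.
 */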
444 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
445                            int node)
446 {
447         struct mem_cgroup *memcg, *old_memcg;
448         void *ptr;
449
450         memcg = bpf_map_get_memcg(map);
451         old_memcg = set_active_memcg(memcg);
452         ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
453         set_active_memcg(old_memcg);
454         mem_cgroup_put(memcg);
455
456         return ptr;
457 }
458
459 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
460 {
461         struct mem_cgroup *memcg, *old_memcg;
462         void *ptr;
463
464         memcg = bpf_map_get_memcg(map);
465         old_memcg = set_active_memcg(memcg);
466         ptr = kzalloc(size, flags | __GFP_ACCOUNT);
467         set_active_memcg(old_memcg);
468         mem_cgroup_put(memcg);
469
470         return ptr;
471 }
472
473 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
474                                     size_t align, gfp_t flags)
475 {
476         struct mem_cgroup *memcg, *old_memcg;
477         void __percpu *ptr;
478
479         memcg = bpf_map_get_memcg(map);
480         old_memcg = set_active_memcg(memcg);
481         ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
482         set_active_memcg(old_memcg);
483         mem_cgroup_put(memcg);
484
485         return ptr;
486 }
487
488 #else
489 static void bpf_map_save_memcg(struct bpf_map *map)
490 {
491 }
492
493 static void bpf_map_release_memcg(struct bpf_map *map)
494 {
495 }
496 #endif
497
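/* btf_record fields are kept sorted by offset, which lets btf_record_find()
 * locate a field with a binary search.
 */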
498 static int btf_field_cmp(const void *a, const void *b)
499 {
500         const struct btf_field *f1 = a, *f2 = b;
501
502         if (f1->offset < f2->offset)
503                 return -1;
504         else if (f1->offset > f2->offset)
505                 return 1;
506         return 0;
507 }
508
509 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
510                                   enum btf_field_type type)
511 {
512         struct btf_field *field;
513
514         if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
515                 return NULL;
516         field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
517         if (!field || !(field->type & type))
518                 return NULL;
519         return field;
520 }
521
522 void btf_record_free(struct btf_record *rec)
523 {
524         int i;
525
526         if (IS_ERR_OR_NULL(rec))
527                 return;
528         for (i = 0; i < rec->cnt; i++) {
529                 switch (rec->fields[i].type) {
530                 case BPF_SPIN_LOCK:
531                 case BPF_TIMER:
532                         break;
533                 case BPF_KPTR_UNREF:
534                 case BPF_KPTR_REF:
535                         if (rec->fields[i].kptr.module)
536                                 module_put(rec->fields[i].kptr.module);
537                         btf_put(rec->fields[i].kptr.btf);
538                         break;
539                 case BPF_LIST_HEAD:
540                 case BPF_LIST_NODE:
541                         /* Nothing to release for bpf_list_head */
542                         break;
543                 default:
544                         WARN_ON_ONCE(1);
545                         continue;
546                 }
547         }
548         kfree(rec);
549 }
550
551 void bpf_map_free_record(struct bpf_map *map)
552 {
553         btf_record_free(map->record);
554         map->record = NULL;
555 }
556
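/* Deep-copy a btf_record: duplicate the array of fields and take the BTF
 * and module references that referenced kptr fields hold; on failure the
 * partially initialized copy is freed via btf_record_free().
 */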
557 struct btf_record *btf_record_dup(const struct btf_record *rec)
558 {
559         const struct btf_field *fields;
560         struct btf_record *new_rec;
561         int ret, size, i;
562
563         if (IS_ERR_OR_NULL(rec))
564                 return NULL;
565         size = offsetof(struct btf_record, fields[rec->cnt]);
566         new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
567         if (!new_rec)
568                 return ERR_PTR(-ENOMEM);
569         /* Do a deep copy of the btf_record */
570         fields = rec->fields;
571         new_rec->cnt = 0;
572         for (i = 0; i < rec->cnt; i++) {
573                 switch (fields[i].type) {
574                 case BPF_SPIN_LOCK:
575                 case BPF_TIMER:
576                         break;
577                 case BPF_KPTR_UNREF:
578                 case BPF_KPTR_REF:
579                         btf_get(fields[i].kptr.btf);
580                         if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
581                                 ret = -ENXIO;
582                                 goto free;
583                         }
584                         break;
585                 case BPF_LIST_HEAD:
586                 case BPF_LIST_NODE:
587                         /* Nothing to acquire for bpf_list_head */
588                         break;
589                 default:
590                         ret = -EFAULT;
591                         WARN_ON_ONCE(1);
592                         goto free;
593                 }
594                 new_rec->cnt++;
595         }
596         return new_rec;
597 free:
598         btf_record_free(new_rec);
599         return ERR_PTR(ret);
600 }
601
602 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
603 {
604         bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
605         int size;
606
607         if (!a_has_fields && !b_has_fields)
608                 return true;
609         if (a_has_fields != b_has_fields)
610                 return false;
611         if (rec_a->cnt != rec_b->cnt)
612                 return false;
613         size = offsetof(struct btf_record, fields[rec_a->cnt]);
614         /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
615          * members are zeroed out. So memcmp is safe to do without worrying
616          * about padding/unused fields.
617          *
618          * While spin_lock, timer, and kptr have no relation to map BTF,
619          * list_head metadata is specific to map BTF, the btf and value_rec
620          * members in particular. btf is the map BTF, while value_rec points to
621          * btf_record in that map BTF.
622          *
623          * So while by default, we don't rely on the map BTF (which the records
624          * were parsed from) matching for both records, which is not backwards
625          * compatible, in case list_head is part of it, we implicitly rely on
626          * that by way of depending on memcmp succeeding for it.
627          */
628         return !memcmp(rec_a, rec_b, size);
629 }
630
631 void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
632 {
633         if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
634                 return;
635         bpf_timer_cancel_and_free(obj + rec->timer_off);
636 }
637
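/* Release every special field embedded in a map value / allocated object:
 * cancel and free timers, clear unreferenced kptrs, call the destructor for
 * referenced kptrs, and drain bpf_list_heads under the record's spin lock.
 */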
638 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
639 {
640         const struct btf_field *fields;
641         int i;
642
643         if (IS_ERR_OR_NULL(rec))
644                 return;
645         fields = rec->fields;
646         for (i = 0; i < rec->cnt; i++) {
647                 const struct btf_field *field = &fields[i];
648                 void *field_ptr = obj + field->offset;
649
650                 switch (fields[i].type) {
651                 case BPF_SPIN_LOCK:
652                         break;
653                 case BPF_TIMER:
654                         bpf_timer_cancel_and_free(field_ptr);
655                         break;
656                 case BPF_KPTR_UNREF:
657                         WRITE_ONCE(*(u64 *)field_ptr, 0);
658                         break;
659                 case BPF_KPTR_REF:
660                         field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
661                         break;
662                 case BPF_LIST_HEAD:
663                         if (WARN_ON_ONCE(rec->spin_lock_off < 0))
664                                 continue;
665                         bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
666                         break;
667                 case BPF_LIST_NODE:
668                         break;
669                 default:
670                         WARN_ON_ONCE(1);
671                         continue;
672                 }
673         }
674 }
675
676 /* called from workqueue */
677 static void bpf_map_free_deferred(struct work_struct *work)
678 {
679         struct bpf_map *map = container_of(work, struct bpf_map, work);
680         struct btf_field_offs *foffs = map->field_offs;
681         struct btf_record *rec = map->record;
682
683         security_bpf_map_free(map);
684         bpf_map_release_memcg(map);
685         /* implementation dependent freeing */
686         map->ops->map_free(map);
687         /* Delay freeing of field_offs and btf_record for maps, as map_free
688          * callback usually needs access to them. It is better to do it here
689          * than require each callback to do the free itself manually.
690          *
691          * Note that the btf_record stashed in map->inner_map_meta->record was
692          * already freed via the map_free callback in the map-in-map case, which
693          * eventually calls bpf_map_free_meta, since inner_map_meta is only a
694          * template bpf_map struct used during verification.
695          */
696         kfree(foffs);
697         btf_record_free(rec);
698 }
699
700 static void bpf_map_put_uref(struct bpf_map *map)
701 {
702         if (atomic64_dec_and_test(&map->usercnt)) {
703                 if (map->ops->map_release_uref)
704                         map->ops->map_release_uref(map);
705         }
706 }
707
708 /* decrement map refcnt and schedule it for freeing via workqueue
709  * (the underlying map implementation's ops->map_free() might sleep)
710  */
711 static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
712 {
713         if (atomic64_dec_and_test(&map->refcnt)) {
714                 /* bpf_map_free_id() must be called first */
715                 bpf_map_free_id(map, do_idr_lock);
716                 btf_put(map->btf);
717                 INIT_WORK(&map->work, bpf_map_free_deferred);
718                 /* Avoid spawning kworkers, since they all might contend
719                  * for the same mutex, such as slab_mutex.
720                  */
721                 queue_work(system_unbound_wq, &map->work);
722         }
723 }
724
725 void bpf_map_put(struct bpf_map *map)
726 {
727         __bpf_map_put(map, true);
728 }
729 EXPORT_SYMBOL_GPL(bpf_map_put);
730
731 void bpf_map_put_with_uref(struct bpf_map *map)
732 {
733         bpf_map_put_uref(map);
734         bpf_map_put(map);
735 }
736
737 static int bpf_map_release(struct inode *inode, struct file *filp)
738 {
739         struct bpf_map *map = filp->private_data;
740
741         if (map->ops->map_release)
742                 map->ops->map_release(map, filp);
743
744         bpf_map_put_with_uref(map);
745         return 0;
746 }
747
748 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
749 {
750         fmode_t mode = f.file->f_mode;
751
752         /* Our file permissions may have been overridden by global
753          * map permissions on the syscall side (e.g. map freezing).
754          */
755         if (READ_ONCE(map->frozen))
756                 mode &= ~FMODE_CAN_WRITE;
757         return mode;
758 }
759
760 #ifdef CONFIG_PROC_FS
761 /* Provides an approximation of the map's memory footprint.
762  * Used only to provide backward compatibility and display
763  * a reasonable "memlock" info.
764  */
765 static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
766 {
767         unsigned long size;
768
769         size = round_up(map->key_size + bpf_map_value_size(map), 8);
770
771         return round_up(map->max_entries * size, PAGE_SIZE);
772 }
773
774 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
775 {
776         struct bpf_map *map = filp->private_data;
777         u32 type = 0, jited = 0;
778
779         if (map_type_contains_progs(map)) {
780                 spin_lock(&map->owner.lock);
781                 type  = map->owner.type;
782                 jited = map->owner.jited;
783                 spin_unlock(&map->owner.lock);
784         }
785
786         seq_printf(m,
787                    "map_type:\t%u\n"
788                    "key_size:\t%u\n"
789                    "value_size:\t%u\n"
790                    "max_entries:\t%u\n"
791                    "map_flags:\t%#x\n"
792                    "map_extra:\t%#llx\n"
793                    "memlock:\t%lu\n"
794                    "map_id:\t%u\n"
795                    "frozen:\t%u\n",
796                    map->map_type,
797                    map->key_size,
798                    map->value_size,
799                    map->max_entries,
800                    map->map_flags,
801                    (unsigned long long)map->map_extra,
802                    bpf_map_memory_footprint(map),
803                    map->id,
804                    READ_ONCE(map->frozen));
805         if (type) {
806                 seq_printf(m, "owner_prog_type:\t%u\n", type);
807                 seq_printf(m, "owner_jited:\t%u\n", jited);
808         }
809 }
810 #endif
811
812 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
813                               loff_t *ppos)
814 {
815         /* We need this handler such that alloc_file() enables
816          * f_mode with FMODE_CAN_READ.
817          */
818         return -EINVAL;
819 }
820
821 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
822                                size_t siz, loff_t *ppos)
823 {
824         /* We need this handler such that alloc_file() enables
825          * f_mode with FMODE_CAN_WRITE.
826          */
827         return -EINVAL;
828 }
829
830 /* called for any extra memory-mapped regions (except initial) */
831 static void bpf_map_mmap_open(struct vm_area_struct *vma)
832 {
833         struct bpf_map *map = vma->vm_file->private_data;
834
835         if (vma->vm_flags & VM_MAYWRITE)
836                 bpf_map_write_active_inc(map);
837 }
838
839 /* called for all unmapped memory regions (including the initial one) */
840 static void bpf_map_mmap_close(struct vm_area_struct *vma)
841 {
842         struct bpf_map *map = vma->vm_file->private_data;
843
844         if (vma->vm_flags & VM_MAYWRITE)
845                 bpf_map_write_active_dec(map);
846 }
847
848 static const struct vm_operations_struct bpf_map_default_vmops = {
849         .open           = bpf_map_mmap_open,
850         .close          = bpf_map_mmap_close,
851 };
852
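/* mmap() of a map is only supported by map types that implement ->map_mmap
 * and only for maps without special fields; writable mappings are refused
 * for frozen or BPF_F_RDONLY_PROG maps and are accounted via the
 * write-active counter.
 */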
853 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
854 {
855         struct bpf_map *map = filp->private_data;
856         int err;
857
858         if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
859                 return -ENOTSUPP;
860
861         if (!(vma->vm_flags & VM_SHARED))
862                 return -EINVAL;
863
864         mutex_lock(&map->freeze_mutex);
865
866         if (vma->vm_flags & VM_WRITE) {
867                 if (map->frozen) {
868                         err = -EPERM;
869                         goto out;
870                 }
871                 /* map is meant to be read-only, so do not allow mapping it as
872                  * writable: that would make it possible to leak a writable page
873                  * reference and allow user-space to keep modifying the map after
874                  * freezing, while the verifier assumes its contents do not change
875                  */
876                 if (map->map_flags & BPF_F_RDONLY_PROG) {
877                         err = -EACCES;
878                         goto out;
879                 }
880         }
881
882         /* set default open/close callbacks */
883         vma->vm_ops = &bpf_map_default_vmops;
884         vma->vm_private_data = map;
885         vma->vm_flags &= ~VM_MAYEXEC;
886         if (!(vma->vm_flags & VM_WRITE))
887                 /* disallow re-mapping with PROT_WRITE */
888                 vma->vm_flags &= ~VM_MAYWRITE;
889
890         err = map->ops->map_mmap(map, vma);
891         if (err)
892                 goto out;
893
894         if (vma->vm_flags & VM_MAYWRITE)
895                 bpf_map_write_active_inc(map);
896 out:
897         mutex_unlock(&map->freeze_mutex);
898         return err;
899 }
900
901 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
902 {
903         struct bpf_map *map = filp->private_data;
904
905         if (map->ops->map_poll)
906                 return map->ops->map_poll(map, filp, pts);
907
908         return EPOLLERR;
909 }
910
911 const struct file_operations bpf_map_fops = {
912 #ifdef CONFIG_PROC_FS
913         .show_fdinfo    = bpf_map_show_fdinfo,
914 #endif
915         .release        = bpf_map_release,
916         .read           = bpf_dummy_read,
917         .write          = bpf_dummy_write,
918         .mmap           = bpf_map_mmap,
919         .poll           = bpf_map_poll,
920 };
921
922 int bpf_map_new_fd(struct bpf_map *map, int flags)
923 {
924         int ret;
925
926         ret = security_bpf_map(map, OPEN_FMODE(flags));
927         if (ret < 0)
928                 return ret;
929
930         return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
931                                 flags | O_CLOEXEC);
932 }
933
934 int bpf_get_file_flag(int flags)
935 {
936         if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
937                 return -EINVAL;
938         if (flags & BPF_F_RDONLY)
939                 return O_RDONLY;
940         if (flags & BPF_F_WRONLY)
941                 return O_WRONLY;
942         return O_RDWR;
943 }
944
945 /* helper macro to check that unused fields of 'union bpf_attr' are zero */
946 #define CHECK_ATTR(CMD) \
947         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
948                    sizeof(attr->CMD##_LAST_FIELD), 0, \
949                    sizeof(*attr) - \
950                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
951                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
952
953 /* dst and src must each have at least "size" bytes.
954  * Returns the string length on success and < 0 on error.
955  */
956 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
957 {
958         const char *end = src + size;
959         const char *orig_src = src;
960
961         memset(dst, 0, size);
962         /* Copy all isalnum(), '_' and '.' chars. */
963         while (src < end && *src) {
964                 if (!isalnum(*src) &&
965                     *src != '_' && *src != '.')
966                         return -EINVAL;
967                 *dst++ = *src++;
968         }
969
970         /* No '\0' found in "size" number of bytes */
971         if (src == end)
972                 return -EINVAL;
973
974         return src - orig_src;
975 }
976
977 int map_check_no_btf(const struct bpf_map *map,
978                      const struct btf *btf,
979                      const struct btf_type *key_type,
980                      const struct btf_type *value_type)
981 {
982         return -ENOTSUPP;
983 }
984
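/* Validate the BTF description of a map: the key/value types must match
 * key_size/value_size, special fields (spin lock, timer, kptr, list head)
 * found in the value type are parsed into map->record and are only accepted
 * for map types that support them, and the map may add its own checks via
 * ->map_check_btf().
 */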
985 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
986                          u32 btf_key_id, u32 btf_value_id)
987 {
988         const struct btf_type *key_type, *value_type;
989         u32 key_size, value_size;
990         int ret = 0;
991
992         /* Some maps allow key to be unspecified. */
993         if (btf_key_id) {
994                 key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
995                 if (!key_type || key_size != map->key_size)
996                         return -EINVAL;
997         } else {
998                 key_type = btf_type_by_id(btf, 0);
999                 if (!map->ops->map_check_btf)
1000                         return -EINVAL;
1001         }
1002
1003         value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
1004         if (!value_type || value_size != map->value_size)
1005                 return -EINVAL;
1006
1007         map->record = btf_parse_fields(btf, value_type,
1008                                        BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
1009                                        map->value_size);
1010         if (!IS_ERR_OR_NULL(map->record)) {
1011                 int i;
1012
1013                 if (!bpf_capable()) {
1014                         ret = -EPERM;
1015                         goto free_map_tab;
1016                 }
1017                 if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
1018                         ret = -EACCES;
1019                         goto free_map_tab;
1020                 }
1021                 for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
1022                         switch (map->record->field_mask & (1 << i)) {
1023                         case 0:
1024                                 continue;
1025                         case BPF_SPIN_LOCK:
1026                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1027                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1028                                     map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
1029                                     map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1030                                     map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1031                                     map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1032                                     map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1033                                         ret = -EOPNOTSUPP;
1034                                         goto free_map_tab;
1035                                 }
1036                                 break;
1037                         case BPF_TIMER:
1038                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1039                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1040                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1041                                         ret = -EOPNOTSUPP;
1042                                         goto free_map_tab;
1043                                 }
1044                                 break;
1045                         case BPF_KPTR_UNREF:
1046                         case BPF_KPTR_REF:
1047                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1048                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1049                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1050                                     map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
1051                                         ret = -EOPNOTSUPP;
1052                                         goto free_map_tab;
1053                                 }
1054                                 break;
1055                         case BPF_LIST_HEAD:
1056                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1057                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1058                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1059                                         ret = -EOPNOTSUPP;
1060                                         goto free_map_tab;
1061                                 }
1062                                 break;
1063                         default:
1064                                 /* Fail if map_type checks are missing for a field type */
1065                                 ret = -EOPNOTSUPP;
1066                                 goto free_map_tab;
1067                         }
1068                 }
1069         }
1070
1071         ret = btf_check_and_fixup_fields(btf, map->record);
1072         if (ret < 0)
1073                 goto free_map_tab;
1074
1075         if (map->ops->map_check_btf) {
1076                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
1077                 if (ret < 0)
1078                         goto free_map_tab;
1079         }
1080
1081         return ret;
1082 free_map_tab:
1083         bpf_map_free_record(map);
1084         return ret;
1085 }
1086
1087 #define BPF_MAP_CREATE_LAST_FIELD map_extra
1088 /* called via syscall */
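/* For reference, a minimal userspace sketch of invoking this command through
 * the bpf(2) syscall (illustrative only; map type and sizes are arbitrary,
 * error handling omitted, needs <linux/bpf.h>, <sys/syscall.h>, <unistd.h>):
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = sizeof(__u64),
 *		.max_entries = 64,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */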
1089 static int map_create(union bpf_attr *attr)
1090 {
1091         int numa_node = bpf_map_attr_numa_node(attr);
1092         struct btf_field_offs *foffs;
1093         struct bpf_map *map;
1094         int f_flags;
1095         int err;
1096
1097         err = CHECK_ATTR(BPF_MAP_CREATE);
1098         if (err)
1099                 return -EINVAL;
1100
1101         if (attr->btf_vmlinux_value_type_id) {
1102                 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
1103                     attr->btf_key_type_id || attr->btf_value_type_id)
1104                         return -EINVAL;
1105         } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
1106                 return -EINVAL;
1107         }
1108
1109         if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
1110             attr->map_extra != 0)
1111                 return -EINVAL;
1112
1113         f_flags = bpf_get_file_flag(attr->map_flags);
1114         if (f_flags < 0)
1115                 return f_flags;
1116
1117         if (numa_node != NUMA_NO_NODE &&
1118             ((unsigned int)numa_node >= nr_node_ids ||
1119              !node_online(numa_node)))
1120                 return -EINVAL;
1121
1122         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
1123         map = find_and_alloc_map(attr);
1124         if (IS_ERR(map))
1125                 return PTR_ERR(map);
1126
1127         err = bpf_obj_name_cpy(map->name, attr->map_name,
1128                                sizeof(attr->map_name));
1129         if (err < 0)
1130                 goto free_map;
1131
1132         atomic64_set(&map->refcnt, 1);
1133         atomic64_set(&map->usercnt, 1);
1134         mutex_init(&map->freeze_mutex);
1135         spin_lock_init(&map->owner.lock);
1136
1137         if (attr->btf_key_type_id || attr->btf_value_type_id ||
1138             /* Even if the map's value is a kernel struct,
1139              * the bpf_prog.o must have BTF to begin with
1140              * to figure out the corresponding kernel
1141              * counterpart.  Thus, attr->btf_fd has
1142              * to be valid as well.
1143              */
1144             attr->btf_vmlinux_value_type_id) {
1145                 struct btf *btf;
1146
1147                 btf = btf_get_by_fd(attr->btf_fd);
1148                 if (IS_ERR(btf)) {
1149                         err = PTR_ERR(btf);
1150                         goto free_map;
1151                 }
1152                 if (btf_is_kernel(btf)) {
1153                         btf_put(btf);
1154                         err = -EACCES;
1155                         goto free_map;
1156                 }
1157                 map->btf = btf;
1158
1159                 if (attr->btf_value_type_id) {
1160                         err = map_check_btf(map, btf, attr->btf_key_type_id,
1161                                             attr->btf_value_type_id);
1162                         if (err)
1163                                 goto free_map;
1164                 }
1165
1166                 map->btf_key_type_id = attr->btf_key_type_id;
1167                 map->btf_value_type_id = attr->btf_value_type_id;
1168                 map->btf_vmlinux_value_type_id =
1169                         attr->btf_vmlinux_value_type_id;
1170         }
1171
1172
1173         foffs = btf_parse_field_offs(map->record);
1174         if (IS_ERR(foffs)) {
1175                 err = PTR_ERR(foffs);
1176                 goto free_map;
1177         }
1178         map->field_offs = foffs;
1179
1180         err = security_bpf_map_alloc(map);
1181         if (err)
1182                 goto free_map_field_offs;
1183
1184         err = bpf_map_alloc_id(map);
1185         if (err)
1186                 goto free_map_sec;
1187
1188         bpf_map_save_memcg(map);
1189
1190         err = bpf_map_new_fd(map, f_flags);
1191         if (err < 0) {
1192                 /* failed to allocate fd.
1193                  * bpf_map_put_with_uref() is needed because the above
1194                  * bpf_map_alloc_id() has published the map
1195                  * to the userspace and the userspace may
1196                  * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
1197                  */
1198                 bpf_map_put_with_uref(map);
1199                 return err;
1200         }
1201
1202         return err;
1203
1204 free_map_sec:
1205         security_bpf_map_free(map);
1206 free_map_field_offs:
1207         kfree(map->field_offs);
1208 free_map:
1209         btf_put(map->btf);
1210         map->ops->map_free(map);
1211         return err;
1212 }
1213
1214 /* if error is returned, fd is released.
1215  * On success caller should complete fd access with matching fdput()
1216  */
1217 struct bpf_map *__bpf_map_get(struct fd f)
1218 {
1219         if (!f.file)
1220                 return ERR_PTR(-EBADF);
1221         if (f.file->f_op != &bpf_map_fops) {
1222                 fdput(f);
1223                 return ERR_PTR(-EINVAL);
1224         }
1225
1226         return f.file->private_data;
1227 }
1228
1229 void bpf_map_inc(struct bpf_map *map)
1230 {
1231         atomic64_inc(&map->refcnt);
1232 }
1233 EXPORT_SYMBOL_GPL(bpf_map_inc);
1234
1235 void bpf_map_inc_with_uref(struct bpf_map *map)
1236 {
1237         atomic64_inc(&map->refcnt);
1238         atomic64_inc(&map->usercnt);
1239 }
1240 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
1241
1242 struct bpf_map *bpf_map_get(u32 ufd)
1243 {
1244         struct fd f = fdget(ufd);
1245         struct bpf_map *map;
1246
1247         map = __bpf_map_get(f);
1248         if (IS_ERR(map))
1249                 return map;
1250
1251         bpf_map_inc(map);
1252         fdput(f);
1253
1254         return map;
1255 }
1256 EXPORT_SYMBOL(bpf_map_get);
1257
1258 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
1259 {
1260         struct fd f = fdget(ufd);
1261         struct bpf_map *map;
1262
1263         map = __bpf_map_get(f);
1264         if (IS_ERR(map))
1265                 return map;
1266
1267         bpf_map_inc_with_uref(map);
1268         fdput(f);
1269
1270         return map;
1271 }
1272
1273 /* map_idr_lock should have been held */
1274 static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
1275 {
1276         int refold;
1277
1278         refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
1279         if (!refold)
1280                 return ERR_PTR(-ENOENT);
1281         if (uref)
1282                 atomic64_inc(&map->usercnt);
1283
1284         return map;
1285 }
1286
1287 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
1288 {
1289         spin_lock_bh(&map_idr_lock);
1290         map = __bpf_map_inc_not_zero(map, false);
1291         spin_unlock_bh(&map_idr_lock);
1292
1293         return map;
1294 }
1295 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
1296
1297 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
1298 {
1299         return -ENOTSUPP;
1300 }
1301
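/* Copy the user-supplied key into kernel memory. Maps with a zero key_size
 * (e.g. queue/stack) must pass a NULL key pointer; a non-NULL key is then
 * rejected with -EINVAL.
 */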
1302 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1303 {
1304         if (key_size)
1305                 return vmemdup_user(ukey, key_size);
1306
1307         if (ukey)
1308                 return ERR_PTR(-EINVAL);
1309
1310         return NULL;
1311 }
1312
1313 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1314 {
1315         if (key_size)
1316                 return kvmemdup_bpfptr(ukey, key_size);
1317
1318         if (!bpfptr_is_null(ukey))
1319                 return ERR_PTR(-EINVAL);
1320
1321         return NULL;
1322 }
1323
1324 /* last field in 'union bpf_attr' used by this command */
1325 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1326
1327 static int map_lookup_elem(union bpf_attr *attr)
1328 {
1329         void __user *ukey = u64_to_user_ptr(attr->key);
1330         void __user *uvalue = u64_to_user_ptr(attr->value);
1331         int ufd = attr->map_fd;
1332         struct bpf_map *map;
1333         void *key, *value;
1334         u32 value_size;
1335         struct fd f;
1336         int err;
1337
1338         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1339                 return -EINVAL;
1340
1341         if (attr->flags & ~BPF_F_LOCK)
1342                 return -EINVAL;
1343
1344         f = fdget(ufd);
1345         map = __bpf_map_get(f);
1346         if (IS_ERR(map))
1347                 return PTR_ERR(map);
1348         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1349                 err = -EPERM;
1350                 goto err_put;
1351         }
1352
1353         if ((attr->flags & BPF_F_LOCK) &&
1354             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1355                 err = -EINVAL;
1356                 goto err_put;
1357         }
1358
1359         key = __bpf_copy_key(ukey, map->key_size);
1360         if (IS_ERR(key)) {
1361                 err = PTR_ERR(key);
1362                 goto err_put;
1363         }
1364
1365         value_size = bpf_map_value_size(map);
1366
1367         err = -ENOMEM;
1368         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1369         if (!value)
1370                 goto free_key;
1371
1372         if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1373                 if (copy_from_user(value, uvalue, value_size))
1374                         err = -EFAULT;
1375                 else
1376                         err = bpf_map_copy_value(map, key, value, attr->flags);
1377                 goto free_value;
1378         }
1379
1380         err = bpf_map_copy_value(map, key, value, attr->flags);
1381         if (err)
1382                 goto free_value;
1383
1384         err = -EFAULT;
1385         if (copy_to_user(uvalue, value, value_size) != 0)
1386                 goto free_value;
1387
1388         err = 0;
1389
1390 free_value:
1391         kvfree(value);
1392 free_key:
1393         kvfree(key);
1394 err_put:
1395         fdput(f);
1396         return err;
1397 }
1398
1399
1400 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1401
1402 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1403 {
1404         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1405         bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
1406         int ufd = attr->map_fd;
1407         struct bpf_map *map;
1408         void *key, *value;
1409         u32 value_size;
1410         struct fd f;
1411         int err;
1412
1413         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1414                 return -EINVAL;
1415
1416         f = fdget(ufd);
1417         map = __bpf_map_get(f);
1418         if (IS_ERR(map))
1419                 return PTR_ERR(map);
1420         bpf_map_write_active_inc(map);
1421         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1422                 err = -EPERM;
1423                 goto err_put;
1424         }
1425
1426         if ((attr->flags & BPF_F_LOCK) &&
1427             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1428                 err = -EINVAL;
1429                 goto err_put;
1430         }
1431
1432         key = ___bpf_copy_key(ukey, map->key_size);
1433         if (IS_ERR(key)) {
1434                 err = PTR_ERR(key);
1435                 goto err_put;
1436         }
1437
1438         value_size = bpf_map_value_size(map);
1439         value = kvmemdup_bpfptr(uvalue, value_size);
1440         if (IS_ERR(value)) {
1441                 err = PTR_ERR(value);
1442                 goto free_key;
1443         }
1444
1445         err = bpf_map_update_value(map, f.file, key, value, attr->flags);
1446
1447         kvfree(value);
1448 free_key:
1449         kvfree(key);
1450 err_put:
1451         bpf_map_write_active_dec(map);
1452         fdput(f);
1453         return err;
1454 }
1455
1456 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1457
1458 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1459 {
1460         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1461         int ufd = attr->map_fd;
1462         struct bpf_map *map;
1463         struct fd f;
1464         void *key;
1465         int err;
1466
1467         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1468                 return -EINVAL;
1469
1470         f = fdget(ufd);
1471         map = __bpf_map_get(f);
1472         if (IS_ERR(map))
1473                 return PTR_ERR(map);
1474         bpf_map_write_active_inc(map);
1475         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1476                 err = -EPERM;
1477                 goto err_put;
1478         }
1479
1480         key = ___bpf_copy_key(ukey, map->key_size);
1481         if (IS_ERR(key)) {
1482                 err = PTR_ERR(key);
1483                 goto err_put;
1484         }
1485
1486         if (bpf_map_is_dev_bound(map)) {
1487                 err = bpf_map_offload_delete_elem(map, key);
1488                 goto out;
1489         } else if (IS_FD_PROG_ARRAY(map) ||
1490                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1491                 /* These maps require sleepable context */
1492                 err = map->ops->map_delete_elem(map, key);
1493                 goto out;
1494         }
1495
1496         bpf_disable_instrumentation();
1497         rcu_read_lock();
1498         err = map->ops->map_delete_elem(map, key);
1499         rcu_read_unlock();
1500         bpf_enable_instrumentation();
1501         maybe_wait_bpf_programs(map);
1502 out:
1503         kvfree(key);
1504 err_put:
1505         bpf_map_write_active_dec(map);
1506         fdput(f);
1507         return err;
1508 }
1509
1510 /* last field in 'union bpf_attr' used by this command */
1511 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1512
1513 static int map_get_next_key(union bpf_attr *attr)
1514 {
1515         void __user *ukey = u64_to_user_ptr(attr->key);
1516         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1517         int ufd = attr->map_fd;
1518         struct bpf_map *map;
1519         void *key, *next_key;
1520         struct fd f;
1521         int err;
1522
1523         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1524                 return -EINVAL;
1525
1526         f = fdget(ufd);
1527         map = __bpf_map_get(f);
1528         if (IS_ERR(map))
1529                 return PTR_ERR(map);
1530         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1531                 err = -EPERM;
1532                 goto err_put;
1533         }
1534
1535         if (ukey) {
1536                 key = __bpf_copy_key(ukey, map->key_size);
1537                 if (IS_ERR(key)) {
1538                         err = PTR_ERR(key);
1539                         goto err_put;
1540                 }
1541         } else {
1542                 key = NULL;
1543         }
1544
1545         err = -ENOMEM;
1546         next_key = kvmalloc(map->key_size, GFP_USER);
1547         if (!next_key)
1548                 goto free_key;
1549
1550         if (bpf_map_is_dev_bound(map)) {
1551                 err = bpf_map_offload_get_next_key(map, key, next_key);
1552                 goto out;
1553         }
1554
1555         rcu_read_lock();
1556         err = map->ops->map_get_next_key(map, key, next_key);
1557         rcu_read_unlock();
1558 out:
1559         if (err)
1560                 goto free_next_key;
1561
1562         err = -EFAULT;
1563         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1564                 goto free_next_key;
1565
1566         err = 0;
1567
1568 free_next_key:
1569         kvfree(next_key);
1570 free_key:
1571         kvfree(key);
1572 err_put:
1573         fdput(f);
1574         return err;
1575 }
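/*
 * Illustrative user-space sketch (not part of this file): walking all keys of
 * a map with BPF_MAP_GET_NEXT_KEY. Passing a NULL key (attr.key == 0) returns
 * the first key, as the handler above shows; ENOENT means the previous key
 * was the last one. "map_fd" and KEY_SIZE are hypothetical placeholders.
 *
 *	char key[KEY_SIZE], next_key[KEY_SIZE];
 *	union bpf_attr attr = {};
 *	int nkeys = 0;
 *
 *	attr.map_fd   = map_fd;
 *	attr.key      = 0;
 *	attr.next_key = (__u64)(unsigned long)next_key;
 *	while (!syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr))) {
 *		nkeys++;
 *		memcpy(key, next_key, sizeof(key));
 *		attr.key = (__u64)(unsigned long)key;
 *	}
 */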
1576
1577 int generic_map_delete_batch(struct bpf_map *map,
1578                              const union bpf_attr *attr,
1579                              union bpf_attr __user *uattr)
1580 {
1581         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1582         u32 cp, max_count;
1583         int err = 0;
1584         void *key;
1585
1586         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1587                 return -EINVAL;
1588
1589         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1590             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1591                 return -EINVAL;
1592         }
1593
1594         max_count = attr->batch.count;
1595         if (!max_count)
1596                 return 0;
1597
1598         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1599         if (!key)
1600                 return -ENOMEM;
1601
1602         for (cp = 0; cp < max_count; cp++) {
1603                 err = -EFAULT;
1604                 if (copy_from_user(key, keys + cp * map->key_size,
1605                                    map->key_size))
1606                         break;
1607
1608                 if (bpf_map_is_dev_bound(map)) {
1609                         err = bpf_map_offload_delete_elem(map, key);
1610                         break;
1611                 }
1612
1613                 bpf_disable_instrumentation();
1614                 rcu_read_lock();
1615                 err = map->ops->map_delete_elem(map, key);
1616                 rcu_read_unlock();
1617                 bpf_enable_instrumentation();
1618                 if (err)
1619                         break;
1620                 cond_resched();
1621         }
1622         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1623                 err = -EFAULT;
1624
1625         kvfree(key);
1626
1627         maybe_wait_bpf_programs(map);
1628         return err;
1629 }
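/*
 * Illustrative user-space sketch (not part of this file): BPF_MAP_DELETE_BATCH
 * takes "count" keys laid out back to back in one buffer. As the handler
 * above shows, batch.count is overwritten on return with the number of
 * elements actually processed, so a partial failure tells the caller where to
 * resume. "map_fd", "keys" and "count" are hypothetical.
 *
 *	union bpf_attr attr = {};
 *	int err;
 *
 *	attr.batch.map_fd     = map_fd;
 *	attr.batch.keys       = (__u64)(unsigned long)keys;	count * key_size bytes
 *	attr.batch.count      = count;
 *	attr.batch.elem_flags = 0;
 *	err = syscall(__NR_bpf, BPF_MAP_DELETE_BATCH, &attr, sizeof(attr));
 */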
1630
1631 int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
1632                              const union bpf_attr *attr,
1633                              union bpf_attr __user *uattr)
1634 {
1635         void __user *values = u64_to_user_ptr(attr->batch.values);
1636         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1637         u32 value_size, cp, max_count;
1638         void *key, *value;
1639         int err = 0;
1640
1641         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1642                 return -EINVAL;
1643
1644         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1645             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1646                 return -EINVAL;
1647         }
1648
1649         value_size = bpf_map_value_size(map);
1650
1651         max_count = attr->batch.count;
1652         if (!max_count)
1653                 return 0;
1654
1655         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1656         if (!key)
1657                 return -ENOMEM;
1658
1659         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1660         if (!value) {
1661                 kvfree(key);
1662                 return -ENOMEM;
1663         }
1664
1665         for (cp = 0; cp < max_count; cp++) {
1666                 err = -EFAULT;
1667                 if (copy_from_user(key, keys + cp * map->key_size,
1668                     map->key_size) ||
1669                     copy_from_user(value, values + cp * value_size, value_size))
1670                         break;
1671
1672                 err = bpf_map_update_value(map, map_file, key, value,
1673                                            attr->batch.elem_flags);
1674
1675                 if (err)
1676                         break;
1677                 cond_resched();
1678         }
1679
1680         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1681                 err = -EFAULT;
1682
1683         kvfree(value);
1684         kvfree(key);
1685         return err;
1686 }
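/*
 * Illustrative user-space sketch (not part of this file): BPF_MAP_UPDATE_BATCH
 * mirrors the delete batch above but also takes a flat array of values. The
 * only per-element flag accepted here is BPF_F_LOCK (for maps containing a
 * bpf_spin_lock); batch.count again reports how many elements were written
 * before any error. "map_fd", "keys", "values" and "count" are hypothetical.
 *
 *	union bpf_attr attr = {};
 *	int err;
 *
 *	attr.batch.map_fd     = map_fd;
 *	attr.batch.keys       = (__u64)(unsigned long)keys;	count * key_size
 *	attr.batch.values     = (__u64)(unsigned long)values;	count * value_size
 *	attr.batch.count      = count;
 *	attr.batch.elem_flags = 0;
 *	err = syscall(__NR_bpf, BPF_MAP_UPDATE_BATCH, &attr, sizeof(attr));
 */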
1687
1688 #define MAP_LOOKUP_RETRIES 3
1689
1690 int generic_map_lookup_batch(struct bpf_map *map,
1691                                     const union bpf_attr *attr,
1692                                     union bpf_attr __user *uattr)
1693 {
1694         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1695         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1696         void __user *values = u64_to_user_ptr(attr->batch.values);
1697         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1698         void *buf, *buf_prevkey, *prev_key, *key, *value;
1699         int err, retry = MAP_LOOKUP_RETRIES;
1700         u32 value_size, cp, max_count;
1701
1702         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1703                 return -EINVAL;
1704
1705         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1706             !btf_record_has_field(map->record, BPF_SPIN_LOCK))
1707                 return -EINVAL;
1708
1709         value_size = bpf_map_value_size(map);
1710
1711         max_count = attr->batch.count;
1712         if (!max_count)
1713                 return 0;
1714
1715         if (put_user(0, &uattr->batch.count))
1716                 return -EFAULT;
1717
1718         buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1719         if (!buf_prevkey)
1720                 return -ENOMEM;
1721
1722         buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1723         if (!buf) {
1724                 kvfree(buf_prevkey);
1725                 return -ENOMEM;
1726         }
1727
1728         err = -EFAULT;
1729         prev_key = NULL;
1730         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1731                 goto free_buf;
1732         key = buf;
1733         value = key + map->key_size;
1734         if (ubatch)
1735                 prev_key = buf_prevkey;
1736
1737         for (cp = 0; cp < max_count;) {
1738                 rcu_read_lock();
1739                 err = map->ops->map_get_next_key(map, prev_key, key);
1740                 rcu_read_unlock();
1741                 if (err)
1742                         break;
1743                 err = bpf_map_copy_value(map, key, value,
1744                                          attr->batch.elem_flags);
1745
1746                 if (err == -ENOENT) {
1747                         if (retry) {
1748                                 retry--;
1749                                 continue;
1750                         }
1751                         err = -EINTR;
1752                         break;
1753                 }
1754
1755                 if (err)
1756                         goto free_buf;
1757
1758                 if (copy_to_user(keys + cp * map->key_size, key,
1759                                  map->key_size)) {
1760                         err = -EFAULT;
1761                         goto free_buf;
1762                 }
1763                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1764                         err = -EFAULT;
1765                         goto free_buf;
1766                 }
1767
1768                 if (!prev_key)
1769                         prev_key = buf_prevkey;
1770
1771                 swap(prev_key, key);
1772                 retry = MAP_LOOKUP_RETRIES;
1773                 cp++;
1774                 cond_resched();
1775         }
1776
1777         if (err == -EFAULT)
1778                 goto free_buf;
1779
1780         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1781                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1782                 err = -EFAULT;
1783
1784 free_buf:
1785         kvfree(buf_prevkey);
1786         kvfree(buf);
1787         return err;
1788 }
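/*
 * Illustrative user-space sketch (not part of this file): dumping a map with
 * BPF_MAP_LOOKUP_BATCH. in_batch/out_batch point at key_size-byte cursor
 * buffers: pass in_batch = NULL (0) on the first call and feed the returned
 * out_batch cursor back in on the next one. ENOENT means the whole map has
 * been walked; on other errors (including the EINTR returned above after
 * repeated lookup races) batch.count and out_batch still describe the
 * progress made, so the caller can resume. "map_fd", "keys", "values",
 * KEY_SIZE and BATCH are hypothetical.
 *
 *	char cursor[KEY_SIZE];
 *	union bpf_attr attr = {};
 *	int err, first = 1;
 *
 *	do {
 *		attr.batch.map_fd    = map_fd;
 *		attr.batch.in_batch  = first ? 0 : (__u64)(unsigned long)cursor;
 *		attr.batch.out_batch = (__u64)(unsigned long)cursor;
 *		attr.batch.keys      = (__u64)(unsigned long)keys;
 *		attr.batch.values    = (__u64)(unsigned long)values;
 *		attr.batch.count     = BATCH;	in: capacity, out: elements copied
 *		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
 *		first = 0;
 *	} while (!err);
 */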
1789
1790 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
1791
1792 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1793 {
1794         void __user *ukey = u64_to_user_ptr(attr->key);
1795         void __user *uvalue = u64_to_user_ptr(attr->value);
1796         int ufd = attr->map_fd;
1797         struct bpf_map *map;
1798         void *key, *value;
1799         u32 value_size;
1800         struct fd f;
1801         int err;
1802
1803         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1804                 return -EINVAL;
1805
1806         if (attr->flags & ~BPF_F_LOCK)
1807                 return -EINVAL;
1808
1809         f = fdget(ufd);
1810         map = __bpf_map_get(f);
1811         if (IS_ERR(map))
1812                 return PTR_ERR(map);
1813         bpf_map_write_active_inc(map);
1814         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
1815             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1816                 err = -EPERM;
1817                 goto err_put;
1818         }
1819
1820         if (attr->flags &&
1821             (map->map_type == BPF_MAP_TYPE_QUEUE ||
1822              map->map_type == BPF_MAP_TYPE_STACK)) {
1823                 err = -EINVAL;
1824                 goto err_put;
1825         }
1826
1827         if ((attr->flags & BPF_F_LOCK) &&
1828             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1829                 err = -EINVAL;
1830                 goto err_put;
1831         }
1832
1833         key = __bpf_copy_key(ukey, map->key_size);
1834         if (IS_ERR(key)) {
1835                 err = PTR_ERR(key);
1836                 goto err_put;
1837         }
1838
1839         value_size = bpf_map_value_size(map);
1840
1841         err = -ENOMEM;
1842         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1843         if (!value)
1844                 goto free_key;
1845
1846         err = -ENOTSUPP;
1847         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1848             map->map_type == BPF_MAP_TYPE_STACK) {
1849                 err = map->ops->map_pop_elem(map, value);
1850         } else if (map->map_type == BPF_MAP_TYPE_HASH ||
1851                    map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
1852                    map->map_type == BPF_MAP_TYPE_LRU_HASH ||
1853                    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
1854                 if (!bpf_map_is_dev_bound(map)) {
1855                         bpf_disable_instrumentation();
1856                         rcu_read_lock();
1857                         err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
1858                         rcu_read_unlock();
1859                         bpf_enable_instrumentation();
1860                 }
1861         }
1862
1863         if (err)
1864                 goto free_value;
1865
1866         if (copy_to_user(uvalue, value, value_size) != 0) {
1867                 err = -EFAULT;
1868                 goto free_value;
1869         }
1870
1871         err = 0;
1872
1873 free_value:
1874         kvfree(value);
1875 free_key:
1876         kvfree(key);
1877 err_put:
1878         bpf_map_write_active_dec(map);
1879         fdput(f);
1880         return err;
1881 }
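/*
 * Illustrative user-space sketch (not part of this file): for queue and stack
 * maps this command is the "pop" operation; their key_size is 0, so no key is
 * passed (attr.key stays 0). Hash-type maps take a real key and, optionally,
 * BPF_F_LOCK in attr.flags. "map_fd" and "value" are hypothetical.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.map_fd = map_fd;
 *	attr.key    = 0;
 *	attr.value  = (__u64)(unsigned long)&value;
 *	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)))
 *		perror("pop");		ENOENT once the queue/stack is empty
 */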
1882
1883 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
1884
1885 static int map_freeze(const union bpf_attr *attr)
1886 {
1887         int err = 0, ufd = attr->map_fd;
1888         struct bpf_map *map;
1889         struct fd f;
1890
1891         if (CHECK_ATTR(BPF_MAP_FREEZE))
1892                 return -EINVAL;
1893
1894         f = fdget(ufd);
1895         map = __bpf_map_get(f);
1896         if (IS_ERR(map))
1897                 return PTR_ERR(map);
1898
1899         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
1900                 fdput(f);
1901                 return -ENOTSUPP;
1902         }
1903
1904         mutex_lock(&map->freeze_mutex);
1905         if (bpf_map_write_active(map)) {
1906                 err = -EBUSY;
1907                 goto err_put;
1908         }
1909         if (READ_ONCE(map->frozen)) {
1910                 err = -EBUSY;
1911                 goto err_put;
1912         }
1913         if (!bpf_capable()) {
1914                 err = -EPERM;
1915                 goto err_put;
1916         }
1917
1918         WRITE_ONCE(map->frozen, true);
1919 err_put:
1920         mutex_unlock(&map->freeze_mutex);
1921         fdput(f);
1922         return err;
1923 }
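/*
 * Illustrative user-space sketch (not part of this file): freezing a map with
 * BPF_MAP_FREEZE. Once frozen, further writes coming through the bpf()
 * syscall are rejected, while BPF programs can still update the map (unless
 * it was also created read-only for programs); the EBUSY above means a
 * syscall-side write was in flight or the map was already frozen. "map_fd"
 * is hypothetical.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.map_fd = map_fd;
 *	if (syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr)))
 *		perror("BPF_MAP_FREEZE");
 */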
1924
1925 static const struct bpf_prog_ops * const bpf_prog_types[] = {
1926 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
1927         [_id] = & _name ## _prog_ops,
1928 #define BPF_MAP_TYPE(_id, _ops)
1929 #define BPF_LINK_TYPE(_id, _name)
1930 #include <linux/bpf_types.h>
1931 #undef BPF_PROG_TYPE
1932 #undef BPF_MAP_TYPE
1933 #undef BPF_LINK_TYPE
1934 };
1935
1936 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
1937 {
1938         const struct bpf_prog_ops *ops;
1939
1940         if (type >= ARRAY_SIZE(bpf_prog_types))
1941                 return -EINVAL;
1942         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
1943         ops = bpf_prog_types[type];
1944         if (!ops)
1945                 return -EINVAL;
1946
1947         if (!bpf_prog_is_dev_bound(prog->aux))
1948                 prog->aux->ops = ops;
1949         else
1950                 prog->aux->ops = &bpf_offload_prog_ops;
1951         prog->type = type;
1952         return 0;
1953 }
1954
1955 enum bpf_audit {
1956         BPF_AUDIT_LOAD,
1957         BPF_AUDIT_UNLOAD,
1958         BPF_AUDIT_MAX,
1959 };
1960
1961 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
1962         [BPF_AUDIT_LOAD]   = "LOAD",
1963         [BPF_AUDIT_UNLOAD] = "UNLOAD",
1964 };
1965
1966 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
1967 {
1968         struct audit_context *ctx = NULL;
1969         struct audit_buffer *ab;
1970
1971         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
1972                 return;
1973         if (audit_enabled == AUDIT_OFF)
1974                 return;
1975         if (op == BPF_AUDIT_LOAD)
1976                 ctx = audit_context();
1977         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
1978         if (unlikely(!ab))
1979                 return;
1980         audit_log_format(ab, "prog-id=%u op=%s",
1981                          prog->aux->id, bpf_audit_str[op]);
1982         audit_log_end(ab);
1983 }
1984
1985 static int bpf_prog_alloc_id(struct bpf_prog *prog)
1986 {
1987         int id;
1988
1989         idr_preload(GFP_KERNEL);
1990         spin_lock_bh(&prog_idr_lock);
1991         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
1992         if (id > 0)
1993                 prog->aux->id = id;
1994         spin_unlock_bh(&prog_idr_lock);
1995         idr_preload_end();
1996
1997         /* id is in [1, INT_MAX) */
1998         if (WARN_ON_ONCE(!id))
1999                 return -ENOSPC;
2000
2001         return id > 0 ? 0 : id;
2002 }
2003
2004 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
2005 {
2006         unsigned long flags;
2007
2008         /* cBPF to eBPF migrations are currently not in the idr store.
2009          * Offloaded programs are removed from the store when their device
2010          * disappears - even if someone grabs an fd to them they are unusable,
2011          * simply waiting for refcnt to drop to be freed.
2012          */
2013         if (!prog->aux->id)
2014                 return;
2015
2016         if (do_idr_lock)
2017                 spin_lock_irqsave(&prog_idr_lock, flags);
2018         else
2019                 __acquire(&prog_idr_lock);
2020
2021         idr_remove(&prog_idr, prog->aux->id);
2022         prog->aux->id = 0;
2023
2024         if (do_idr_lock)
2025                 spin_unlock_irqrestore(&prog_idr_lock, flags);
2026         else
2027                 __release(&prog_idr_lock);
2028 }
2029
2030 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2031 {
2032         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
2033
2034         kvfree(aux->func_info);
2035         kfree(aux->func_info_aux);
2036         free_uid(aux->user);
2037         security_bpf_prog_free(aux);
2038         bpf_prog_free(aux->prog);
2039 }
2040
2041 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2042 {
2043         bpf_prog_kallsyms_del_all(prog);
2044         btf_put(prog->aux->btf);
2045         kvfree(prog->aux->jited_linfo);
2046         kvfree(prog->aux->linfo);
2047         kfree(prog->aux->kfunc_tab);
2048         if (prog->aux->attach_btf)
2049                 btf_put(prog->aux->attach_btf);
2050
2051         if (deferred) {
2052                 if (prog->aux->sleepable)
2053                         call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
2054                 else
2055                         call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
2056         } else {
2057                 __bpf_prog_put_rcu(&prog->aux->rcu);
2058         }
2059 }
2060
2061 static void bpf_prog_put_deferred(struct work_struct *work)
2062 {
2063         struct bpf_prog_aux *aux;
2064         struct bpf_prog *prog;
2065
2066         aux = container_of(work, struct bpf_prog_aux, work);
2067         prog = aux->prog;
2068         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2069         bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
2070         __bpf_prog_put_noref(prog, true);
2071 }
2072
2073 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
2074 {
2075         struct bpf_prog_aux *aux = prog->aux;
2076
2077         if (atomic64_dec_and_test(&aux->refcnt)) {
2078                 /* bpf_prog_free_id() must be called first */
2079                 bpf_prog_free_id(prog, do_idr_lock);
2080
2081                 if (in_irq() || irqs_disabled()) {
2082                         INIT_WORK(&aux->work, bpf_prog_put_deferred);
2083                         schedule_work(&aux->work);
2084                 } else {
2085                         bpf_prog_put_deferred(&aux->work);
2086                 }
2087         }
2088 }
2089
2090 void bpf_prog_put(struct bpf_prog *prog)
2091 {
2092         __bpf_prog_put(prog, true);
2093 }
2094 EXPORT_SYMBOL_GPL(bpf_prog_put);
2095
2096 static int bpf_prog_release(struct inode *inode, struct file *filp)
2097 {
2098         struct bpf_prog *prog = filp->private_data;
2099
2100         bpf_prog_put(prog);
2101         return 0;
2102 }
2103
2104 struct bpf_prog_kstats {
2105         u64 nsecs;
2106         u64 cnt;
2107         u64 misses;
2108 };
2109
2110 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2111 {
2112         struct bpf_prog_stats *stats;
2113         unsigned int flags;
2114
2115         stats = this_cpu_ptr(prog->stats);
2116         flags = u64_stats_update_begin_irqsave(&stats->syncp);
2117         u64_stats_inc(&stats->misses);
2118         u64_stats_update_end_irqrestore(&stats->syncp, flags);
2119 }
2120
2121 static void bpf_prog_get_stats(const struct bpf_prog *prog,
2122                                struct bpf_prog_kstats *stats)
2123 {
2124         u64 nsecs = 0, cnt = 0, misses = 0;
2125         int cpu;
2126
2127         for_each_possible_cpu(cpu) {
2128                 const struct bpf_prog_stats *st;
2129                 unsigned int start;
2130                 u64 tnsecs, tcnt, tmisses;
2131
2132                 st = per_cpu_ptr(prog->stats, cpu);
2133                 do {
2134                         start = u64_stats_fetch_begin(&st->syncp);
2135                         tnsecs = u64_stats_read(&st->nsecs);
2136                         tcnt = u64_stats_read(&st->cnt);
2137                         tmisses = u64_stats_read(&st->misses);
2138                 } while (u64_stats_fetch_retry(&st->syncp, start));
2139                 nsecs += tnsecs;
2140                 cnt += tcnt;
2141                 misses += tmisses;
2142         }
2143         stats->nsecs = nsecs;
2144         stats->cnt = cnt;
2145         stats->misses = misses;
2146 }
2147
2148 #ifdef CONFIG_PROC_FS
2149 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
2150 {
2151         const struct bpf_prog *prog = filp->private_data;
2152         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2153         struct bpf_prog_kstats stats;
2154
2155         bpf_prog_get_stats(prog, &stats);
2156         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2157         seq_printf(m,
2158                    "prog_type:\t%u\n"
2159                    "prog_jited:\t%u\n"
2160                    "prog_tag:\t%s\n"
2161                    "memlock:\t%llu\n"
2162                    "prog_id:\t%u\n"
2163                    "run_time_ns:\t%llu\n"
2164                    "run_cnt:\t%llu\n"
2165                    "recursion_misses:\t%llu\n"
2166                    "verified_insns:\t%u\n",
2167                    prog->type,
2168                    prog->jited,
2169                    prog_tag,
2170                    prog->pages * 1ULL << PAGE_SHIFT,
2171                    prog->aux->id,
2172                    stats.nsecs,
2173                    stats.cnt,
2174                    stats.misses,
2175                    prog->aux->verified_insns);
2176 }
2177 #endif
2178
2179 const struct file_operations bpf_prog_fops = {
2180 #ifdef CONFIG_PROC_FS
2181         .show_fdinfo    = bpf_prog_show_fdinfo,
2182 #endif
2183         .release        = bpf_prog_release,
2184         .read           = bpf_dummy_read,
2185         .write          = bpf_dummy_write,
2186 };
2187
2188 int bpf_prog_new_fd(struct bpf_prog *prog)
2189 {
2190         int ret;
2191
2192         ret = security_bpf_prog(prog);
2193         if (ret < 0)
2194                 return ret;
2195
2196         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
2197                                 O_RDWR | O_CLOEXEC);
2198 }
2199
2200 static struct bpf_prog *____bpf_prog_get(struct fd f)
2201 {
2202         if (!f.file)
2203                 return ERR_PTR(-EBADF);
2204         if (f.file->f_op != &bpf_prog_fops) {
2205                 fdput(f);
2206                 return ERR_PTR(-EINVAL);
2207         }
2208
2209         return f.file->private_data;
2210 }
2211
2212 void bpf_prog_add(struct bpf_prog *prog, int i)
2213 {
2214         atomic64_add(i, &prog->aux->refcnt);
2215 }
2216 EXPORT_SYMBOL_GPL(bpf_prog_add);
2217
2218 void bpf_prog_sub(struct bpf_prog *prog, int i)
2219 {
2220         /* Only to be used for undoing previous bpf_prog_add() in some
2221          * error path. We still know that another entity in our call
2222          * path holds a reference to the program, thus atomic_sub() can
2223          * be safely used in such cases!
2224          */
2225         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2226 }
2227 EXPORT_SYMBOL_GPL(bpf_prog_sub);
2228
2229 void bpf_prog_inc(struct bpf_prog *prog)
2230 {
2231         atomic64_inc(&prog->aux->refcnt);
2232 }
2233 EXPORT_SYMBOL_GPL(bpf_prog_inc);
2234
2235 /* prog_idr_lock should have been held */
2236 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2237 {
2238         int refold;
2239
2240         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
2241
2242         if (!refold)
2243                 return ERR_PTR(-ENOENT);
2244
2245         return prog;
2246 }
2247 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2248
2249 bool bpf_prog_get_ok(struct bpf_prog *prog,
2250                             enum bpf_prog_type *attach_type, bool attach_drv)
2251 {
2252         /* not an attachment, just a refcount inc, always allow */
2253         if (!attach_type)
2254                 return true;
2255
2256         if (prog->type != *attach_type)
2257                 return false;
2258         if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
2259                 return false;
2260
2261         return true;
2262 }
2263
2264 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2265                                        bool attach_drv)
2266 {
2267         struct fd f = fdget(ufd);
2268         struct bpf_prog *prog;
2269
2270         prog = ____bpf_prog_get(f);
2271         if (IS_ERR(prog))
2272                 return prog;
2273         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2274                 prog = ERR_PTR(-EINVAL);
2275                 goto out;
2276         }
2277
2278         bpf_prog_inc(prog);
2279 out:
2280         fdput(f);
2281         return prog;
2282 }
2283
2284 struct bpf_prog *bpf_prog_get(u32 ufd)
2285 {
2286         return __bpf_prog_get(ufd, NULL, false);
2287 }
2288
2289 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2290                                        bool attach_drv)
2291 {
2292         return __bpf_prog_get(ufd, &type, attach_drv);
2293 }
2294 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2295
2296 /* Initially all BPF programs could be loaded w/o specifying
2297  * expected_attach_type. Later for some of them specifying expected_attach_type
2298  * at load time became required so that the program could be validated properly.
2299  * Programs of types that are allowed to be loaded both w/ and w/o (for
2300  * backward compatibility) expected_attach_type should have the default attach
2301  * type assigned to expected_attach_type for the latter case, so that it can be
2302  * validated later at attach time.
2303  *
2304  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2305  * prog type requires it but has some attach types that have to be backward
2306  * compatible.
2307  */
2308 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2309 {
2310         switch (attr->prog_type) {
2311         case BPF_PROG_TYPE_CGROUP_SOCK:
2312                 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2313                  * exist so checking for non-zero is the way to go here.
2314                  */
2315                 if (!attr->expected_attach_type)
2316                         attr->expected_attach_type =
2317                                 BPF_CGROUP_INET_SOCK_CREATE;
2318                 break;
2319         case BPF_PROG_TYPE_SK_REUSEPORT:
2320                 if (!attr->expected_attach_type)
2321                         attr->expected_attach_type =
2322                                 BPF_SK_REUSEPORT_SELECT;
2323                 break;
2324         }
2325 }
2326
2327 static int
2328 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2329                            enum bpf_attach_type expected_attach_type,
2330                            struct btf *attach_btf, u32 btf_id,
2331                            struct bpf_prog *dst_prog)
2332 {
2333         if (btf_id) {
2334                 if (btf_id > BTF_MAX_TYPE)
2335                         return -EINVAL;
2336
2337                 if (!attach_btf && !dst_prog)
2338                         return -EINVAL;
2339
2340                 switch (prog_type) {
2341                 case BPF_PROG_TYPE_TRACING:
2342                 case BPF_PROG_TYPE_LSM:
2343                 case BPF_PROG_TYPE_STRUCT_OPS:
2344                 case BPF_PROG_TYPE_EXT:
2345                         break;
2346                 default:
2347                         return -EINVAL;
2348                 }
2349         }
2350
2351         if (attach_btf && (!btf_id || dst_prog))
2352                 return -EINVAL;
2353
2354         if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2355             prog_type != BPF_PROG_TYPE_EXT)
2356                 return -EINVAL;
2357
2358         switch (prog_type) {
2359         case BPF_PROG_TYPE_CGROUP_SOCK:
2360                 switch (expected_attach_type) {
2361                 case BPF_CGROUP_INET_SOCK_CREATE:
2362                 case BPF_CGROUP_INET_SOCK_RELEASE:
2363                 case BPF_CGROUP_INET4_POST_BIND:
2364                 case BPF_CGROUP_INET6_POST_BIND:
2365                         return 0;
2366                 default:
2367                         return -EINVAL;
2368                 }
2369         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2370                 switch (expected_attach_type) {
2371                 case BPF_CGROUP_INET4_BIND:
2372                 case BPF_CGROUP_INET6_BIND:
2373                 case BPF_CGROUP_INET4_CONNECT:
2374                 case BPF_CGROUP_INET6_CONNECT:
2375                 case BPF_CGROUP_INET4_GETPEERNAME:
2376                 case BPF_CGROUP_INET6_GETPEERNAME:
2377                 case BPF_CGROUP_INET4_GETSOCKNAME:
2378                 case BPF_CGROUP_INET6_GETSOCKNAME:
2379                 case BPF_CGROUP_UDP4_SENDMSG:
2380                 case BPF_CGROUP_UDP6_SENDMSG:
2381                 case BPF_CGROUP_UDP4_RECVMSG:
2382                 case BPF_CGROUP_UDP6_RECVMSG:
2383                         return 0;
2384                 default:
2385                         return -EINVAL;
2386                 }
2387         case BPF_PROG_TYPE_CGROUP_SKB:
2388                 switch (expected_attach_type) {
2389                 case BPF_CGROUP_INET_INGRESS:
2390                 case BPF_CGROUP_INET_EGRESS:
2391                         return 0;
2392                 default:
2393                         return -EINVAL;
2394                 }
2395         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2396                 switch (expected_attach_type) {
2397                 case BPF_CGROUP_SETSOCKOPT:
2398                 case BPF_CGROUP_GETSOCKOPT:
2399                         return 0;
2400                 default:
2401                         return -EINVAL;
2402                 }
2403         case BPF_PROG_TYPE_SK_LOOKUP:
2404                 if (expected_attach_type == BPF_SK_LOOKUP)
2405                         return 0;
2406                 return -EINVAL;
2407         case BPF_PROG_TYPE_SK_REUSEPORT:
2408                 switch (expected_attach_type) {
2409                 case BPF_SK_REUSEPORT_SELECT:
2410                 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2411                         return 0;
2412                 default:
2413                         return -EINVAL;
2414                 }
2415         case BPF_PROG_TYPE_SYSCALL:
2416         case BPF_PROG_TYPE_EXT:
2417                 if (expected_attach_type)
2418                         return -EINVAL;
2419                 fallthrough;
2420         default:
2421                 return 0;
2422         }
2423 }
2424
2425 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2426 {
2427         switch (prog_type) {
2428         case BPF_PROG_TYPE_SCHED_CLS:
2429         case BPF_PROG_TYPE_SCHED_ACT:
2430         case BPF_PROG_TYPE_XDP:
2431         case BPF_PROG_TYPE_LWT_IN:
2432         case BPF_PROG_TYPE_LWT_OUT:
2433         case BPF_PROG_TYPE_LWT_XMIT:
2434         case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2435         case BPF_PROG_TYPE_SK_SKB:
2436         case BPF_PROG_TYPE_SK_MSG:
2437         case BPF_PROG_TYPE_LIRC_MODE2:
2438         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2439         case BPF_PROG_TYPE_CGROUP_DEVICE:
2440         case BPF_PROG_TYPE_CGROUP_SOCK:
2441         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2442         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2443         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2444         case BPF_PROG_TYPE_SOCK_OPS:
2445         case BPF_PROG_TYPE_EXT: /* extends any prog */
2446                 return true;
2447         case BPF_PROG_TYPE_CGROUP_SKB:
2448                 /* always unpriv */
2449         case BPF_PROG_TYPE_SK_REUSEPORT:
2450                 /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2451         default:
2452                 return false;
2453         }
2454 }
2455
2456 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2457 {
2458         switch (prog_type) {
2459         case BPF_PROG_TYPE_KPROBE:
2460         case BPF_PROG_TYPE_TRACEPOINT:
2461         case BPF_PROG_TYPE_PERF_EVENT:
2462         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2463         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2464         case BPF_PROG_TYPE_TRACING:
2465         case BPF_PROG_TYPE_LSM:
2466         case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2467         case BPF_PROG_TYPE_EXT: /* extends any prog */
2468                 return true;
2469         default:
2470                 return false;
2471         }
2472 }
2473
2474 /* last field in 'union bpf_attr' used by this command */
2475 #define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
2476
2477 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
2478 {
2479         enum bpf_prog_type type = attr->prog_type;
2480         struct bpf_prog *prog, *dst_prog = NULL;
2481         struct btf *attach_btf = NULL;
2482         int err;
2483         char license[128];
2484         bool is_gpl;
2485
2486         if (CHECK_ATTR(BPF_PROG_LOAD))
2487                 return -EINVAL;
2488
2489         if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2490                                  BPF_F_ANY_ALIGNMENT |
2491                                  BPF_F_TEST_STATE_FREQ |
2492                                  BPF_F_SLEEPABLE |
2493                                  BPF_F_TEST_RND_HI32 |
2494                                  BPF_F_XDP_HAS_FRAGS))
2495                 return -EINVAL;
2496
2497         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2498             (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2499             !bpf_capable())
2500                 return -EPERM;
2501
2502         /* copy eBPF program license from user space */
2503         if (strncpy_from_bpfptr(license,
2504                                 make_bpfptr(attr->license, uattr.is_kernel),
2505                                 sizeof(license) - 1) < 0)
2506                 return -EFAULT;
2507         license[sizeof(license) - 1] = 0;
2508
2509         /* eBPF programs must be GPL compatible to use GPL-ed functions */
2510         is_gpl = license_is_gpl_compatible(license);
2511
2512         if (attr->insn_cnt == 0 ||
2513             attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
2514                 return -E2BIG;
2515         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2516             type != BPF_PROG_TYPE_CGROUP_SKB &&
2517             !bpf_capable())
2518                 return -EPERM;
2519
2520         if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
2521                 return -EPERM;
2522         if (is_perfmon_prog_type(type) && !perfmon_capable())
2523                 return -EPERM;
2524
2525         /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
2526          * or btf; we need to check which one it is
2527          */
2528         if (attr->attach_prog_fd) {
2529                 dst_prog = bpf_prog_get(attr->attach_prog_fd);
2530                 if (IS_ERR(dst_prog)) {
2531                         dst_prog = NULL;
2532                         attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
2533                         if (IS_ERR(attach_btf))
2534                                 return -EINVAL;
2535                         if (!btf_is_kernel(attach_btf)) {
2536                                 /* attaching through specifying bpf_prog's BTF
2537                                  * objects directly might be supported eventually
2538                                  */
2539                                 btf_put(attach_btf);
2540                                 return -ENOTSUPP;
2541                         }
2542                 }
2543         } else if (attr->attach_btf_id) {
2544                 /* fall back to vmlinux BTF, if BTF type ID is specified */
2545                 attach_btf = bpf_get_btf_vmlinux();
2546                 if (IS_ERR(attach_btf))
2547                         return PTR_ERR(attach_btf);
2548                 if (!attach_btf)
2549                         return -EINVAL;
2550                 btf_get(attach_btf);
2551         }
2552
2553         bpf_prog_load_fixup_attach_type(attr);
2554         if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2555                                        attach_btf, attr->attach_btf_id,
2556                                        dst_prog)) {
2557                 if (dst_prog)
2558                         bpf_prog_put(dst_prog);
2559                 if (attach_btf)
2560                         btf_put(attach_btf);
2561                 return -EINVAL;
2562         }
2563
2564         /* plain bpf_prog allocation */
2565         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2566         if (!prog) {
2567                 if (dst_prog)
2568                         bpf_prog_put(dst_prog);
2569                 if (attach_btf)
2570                         btf_put(attach_btf);
2571                 return -ENOMEM;
2572         }
2573
2574         prog->expected_attach_type = attr->expected_attach_type;
2575         prog->aux->attach_btf = attach_btf;
2576         prog->aux->attach_btf_id = attr->attach_btf_id;
2577         prog->aux->dst_prog = dst_prog;
2578         prog->aux->offload_requested = !!attr->prog_ifindex;
2579         prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
2580         prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2581
2582         err = security_bpf_prog_alloc(prog->aux);
2583         if (err)
2584                 goto free_prog;
2585
2586         prog->aux->user = get_current_user();
2587         prog->len = attr->insn_cnt;
2588
2589         err = -EFAULT;
2590         if (copy_from_bpfptr(prog->insns,
2591                              make_bpfptr(attr->insns, uattr.is_kernel),
2592                              bpf_prog_insn_size(prog)) != 0)
2593                 goto free_prog_sec;
2594
2595         prog->orig_prog = NULL;
2596         prog->jited = 0;
2597
2598         atomic64_set(&prog->aux->refcnt, 1);
2599         prog->gpl_compatible = is_gpl ? 1 : 0;
2600
2601         if (bpf_prog_is_dev_bound(prog->aux)) {
2602                 err = bpf_prog_offload_init(prog, attr);
2603                 if (err)
2604                         goto free_prog_sec;
2605         }
2606
2607         /* find program type: socket_filter vs tracing_filter */
2608         err = find_prog_type(type, prog);
2609         if (err < 0)
2610                 goto free_prog_sec;
2611
2612         prog->aux->load_time = ktime_get_boottime_ns();
2613         err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2614                                sizeof(attr->prog_name));
2615         if (err < 0)
2616                 goto free_prog_sec;
2617
2618         /* run eBPF verifier */
2619         err = bpf_check(&prog, attr, uattr);
2620         if (err < 0)
2621                 goto free_used_maps;
2622
2623         prog = bpf_prog_select_runtime(prog, &err);
2624         if (err < 0)
2625                 goto free_used_maps;
2626
2627         err = bpf_prog_alloc_id(prog);
2628         if (err)
2629                 goto free_used_maps;
2630
2631         /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2632          * effectively publicly exposed. However, retrieving via
2633          * bpf_prog_get_fd_by_id() will take another reference,
2634          * therefore it cannot be gone underneath us.
2635          *
2636          * Only for the time /after/ successful bpf_prog_new_fd()
2637          * and before returning to userspace, we might just hold
2638          * one reference and any parallel close on that fd could
2639          * rip everything out. Hence, below notifications must
2640          * happen before bpf_prog_new_fd().
2641          *
2642          * Also, any failure handling from this point onwards must
2643          * be using bpf_prog_put() given the program is exposed.
2644          */
2645         bpf_prog_kallsyms_add(prog);
2646         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2647         bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2648
2649         err = bpf_prog_new_fd(prog);
2650         if (err < 0)
2651                 bpf_prog_put(prog);
2652         return err;
2653
2654 free_used_maps:
2655         /* In case we have subprogs, we need to wait for a grace
2656          * period before we can tear down JIT memory since symbols
2657          * are already exposed under kallsyms.
2658          */
2659         __bpf_prog_put_noref(prog, prog->aux->func_cnt);
2660         return err;
2661 free_prog_sec:
2662         free_uid(prog->aux->user);
2663         security_bpf_prog_free(prog->aux);
2664 free_prog:
2665         if (prog->aux->attach_btf)
2666                 btf_put(prog->aux->attach_btf);
2667         bpf_prog_free(prog);
2668         return err;
2669 }
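/*
 * Illustrative user-space sketch (not part of this file): loading the
 * smallest acceptable program ("r0 = 0; exit") through BPF_PROG_LOAD. The
 * opcode macros and struct bpf_insn come from <linux/bpf.h>; the "GPL"
 * license string makes the program gpl_compatible as computed above. Error
 * handling and the optional log_buf/log_level fields are omitted.
 *
 *	struct bpf_insn insns[] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
 *		{ .code = BPF_JMP | BPF_EXIT },
 *	};
 *	union bpf_attr attr = {};
 *	int prog_fd;
 *
 *	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 *	attr.insns     = (__u64)(unsigned long)insns;
 *	attr.insn_cnt  = 2;
 *	attr.license   = (__u64)(unsigned long)"GPL";
 *	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 */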
2670
2671 #define BPF_OBJ_LAST_FIELD file_flags
2672
2673 static int bpf_obj_pin(const union bpf_attr *attr)
2674 {
2675         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
2676                 return -EINVAL;
2677
2678         return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
2679 }
2680
2681 static int bpf_obj_get(const union bpf_attr *attr)
2682 {
2683         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2684             attr->file_flags & ~BPF_OBJ_FLAG_MASK)
2685                 return -EINVAL;
2686
2687         return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
2688                                 attr->file_flags);
2689 }
2690
2691 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2692                    const struct bpf_link_ops *ops, struct bpf_prog *prog)
2693 {
2694         atomic64_set(&link->refcnt, 1);
2695         link->type = type;
2696         link->id = 0;
2697         link->ops = ops;
2698         link->prog = prog;
2699 }
2700
2701 static void bpf_link_free_id(int id)
2702 {
2703         if (!id)
2704                 return;
2705
2706         spin_lock_bh(&link_idr_lock);
2707         idr_remove(&link_idr, id);
2708         spin_unlock_bh(&link_idr_lock);
2709 }
2710
2711 /* Clean up bpf_link and corresponding anon_inode file and FD. After
2712  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
2713  * anon_inode's release() call. This helper marks bpf_link as defunct,
2714  * releases the anon_inode file and puts the reserved FD. bpf_prog's refcnt
2715  * is not decremented; it's the responsibility of the calling code that failed
2716  * to complete bpf_link initialization.
2717  */
2718 void bpf_link_cleanup(struct bpf_link_primer *primer)
2719 {
2720         primer->link->prog = NULL;
2721         bpf_link_free_id(primer->id);
2722         fput(primer->file);
2723         put_unused_fd(primer->fd);
2724 }
2725
2726 void bpf_link_inc(struct bpf_link *link)
2727 {
2728         atomic64_inc(&link->refcnt);
2729 }
2730
2731 /* bpf_link_free is guaranteed to be called from process context */
2732 static void bpf_link_free(struct bpf_link *link)
2733 {
2734         bpf_link_free_id(link->id);
2735         if (link->prog) {
2736                 /* detach BPF program, clean up used resources */
2737                 link->ops->release(link);
2738                 bpf_prog_put(link->prog);
2739         }
2740         /* free bpf_link and its containing memory */
2741         link->ops->dealloc(link);
2742 }
2743
2744 static void bpf_link_put_deferred(struct work_struct *work)
2745 {
2746         struct bpf_link *link = container_of(work, struct bpf_link, work);
2747
2748         bpf_link_free(link);
2749 }
2750
2751 /* bpf_link_put can be called from atomic context, but ensures that resources
2752  * are freed from process context
2753  */
2754 void bpf_link_put(struct bpf_link *link)
2755 {
2756         if (!atomic64_dec_and_test(&link->refcnt))
2757                 return;
2758
2759         if (in_atomic()) {
2760                 INIT_WORK(&link->work, bpf_link_put_deferred);
2761                 schedule_work(&link->work);
2762         } else {
2763                 bpf_link_free(link);
2764         }
2765 }
2766 EXPORT_SYMBOL(bpf_link_put);
2767
2768 static int bpf_link_release(struct inode *inode, struct file *filp)
2769 {
2770         struct bpf_link *link = filp->private_data;
2771
2772         bpf_link_put(link);
2773         return 0;
2774 }
2775
2776 #ifdef CONFIG_PROC_FS
2777 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
2778 #define BPF_MAP_TYPE(_id, _ops)
2779 #define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
2780 static const char *bpf_link_type_strs[] = {
2781         [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
2782 #include <linux/bpf_types.h>
2783 };
2784 #undef BPF_PROG_TYPE
2785 #undef BPF_MAP_TYPE
2786 #undef BPF_LINK_TYPE
2787
2788 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2789 {
2790         const struct bpf_link *link = filp->private_data;
2791         const struct bpf_prog *prog = link->prog;
2792         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2793
2794         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2795         seq_printf(m,
2796                    "link_type:\t%s\n"
2797                    "link_id:\t%u\n"
2798                    "prog_tag:\t%s\n"
2799                    "prog_id:\t%u\n",
2800                    bpf_link_type_strs[link->type],
2801                    link->id,
2802                    prog_tag,
2803                    prog->aux->id);
2804         if (link->ops->show_fdinfo)
2805                 link->ops->show_fdinfo(link, m);
2806 }
2807 #endif
2808
2809 static const struct file_operations bpf_link_fops = {
2810 #ifdef CONFIG_PROC_FS
2811         .show_fdinfo    = bpf_link_show_fdinfo,
2812 #endif
2813         .release        = bpf_link_release,
2814         .read           = bpf_dummy_read,
2815         .write          = bpf_dummy_write,
2816 };
2817
2818 static int bpf_link_alloc_id(struct bpf_link *link)
2819 {
2820         int id;
2821
2822         idr_preload(GFP_KERNEL);
2823         spin_lock_bh(&link_idr_lock);
2824         id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
2825         spin_unlock_bh(&link_idr_lock);
2826         idr_preload_end();
2827
2828         return id;
2829 }
2830
2831 /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
2832  * reserving unused FD and allocating ID from link_idr. This is to be paired
2833  * with bpf_link_settle() to install FD and ID and expose bpf_link to
2834  * user-space, if bpf_link is successfully attached. If not, bpf_link and
2835  * pre-allocated resources are to be freed with a bpf_link_cleanup() call.
2836  * All the transient state is passed around in struct bpf_link_primer.
2837  * This is the preferred way to create and initialize bpf_link, especially
2838  * when there are complicated and expensive operations in between creating
2839  * bpf_link itself and attaching it to a BPF hook. By using bpf_link_prime()
2840  * and bpf_link_settle(), kernel code using bpf_link doesn't have to perform
2841  * expensive (and potentially failing) roll-back operations in the rare case
2842  * that a file, FD, or ID can't be allocated.
2843  */
2844 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
2845 {
2846         struct file *file;
2847         int fd, id;
2848
2849         fd = get_unused_fd_flags(O_CLOEXEC);
2850         if (fd < 0)
2851                 return fd;
2852
2853
2854         id = bpf_link_alloc_id(link);
2855         if (id < 0) {
2856                 put_unused_fd(fd);
2857                 return id;
2858         }
2859
2860         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2861         if (IS_ERR(file)) {
2862                 bpf_link_free_id(id);
2863                 put_unused_fd(fd);
2864                 return PTR_ERR(file);
2865         }
2866
2867         primer->link = link;
2868         primer->file = file;
2869         primer->fd = fd;
2870         primer->id = id;
2871         return 0;
2872 }
2873
2874 int bpf_link_settle(struct bpf_link_primer *primer)
2875 {
2876         /* make bpf_link fetchable by ID */
2877         spin_lock_bh(&link_idr_lock);
2878         primer->link->id = primer->id;
2879         spin_unlock_bh(&link_idr_lock);
2880         /* make bpf_link fetchable by FD */
2881         fd_install(primer->fd, primer->file);
2882         /* pass through installed FD */
2883         return primer->fd;
2884 }
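/*
 * Sketch of how a link implementation typically uses the primer API ("foo_*"
 * names are made-up placeholders; bpf_tracing_prog_attach() below is a real
 * in-tree user of the same pattern):
 *
 *	struct bpf_link_primer primer;
 *	struct foo_link *link;
 *	int err;
 *
 *	link = kzalloc(sizeof(*link), GFP_USER);
 *	if (!link)
 *		return -ENOMEM;
 *	bpf_link_init(&link->link, BPF_LINK_TYPE_xxx, &foo_link_lops, prog);
 *
 *	err = bpf_link_prime(&link->link, &primer);
 *	if (err) {
 *		kfree(link);			not visible yet, plain kfree is fine
 *		return err;
 *	}
 *
 *	err = foo_attach_to_hook(link);		the hook-specific (and failable) work
 *	if (err) {
 *		bpf_link_cleanup(&primer);	drops FD/ID/file; link is freed via
 *		return err;			->dealloc(), so no kfree() here
 *	}
 *
 *	return bpf_link_settle(&primer);	publishes FD and ID to user-space
 */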
2885
2886 int bpf_link_new_fd(struct bpf_link *link)
2887 {
2888         return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
2889 }
2890
2891 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
2892 {
2893         struct fd f = fdget(ufd);
2894         struct bpf_link *link;
2895
2896         if (!f.file)
2897                 return ERR_PTR(-EBADF);
2898         if (f.file->f_op != &bpf_link_fops) {
2899                 fdput(f);
2900                 return ERR_PTR(-EINVAL);
2901         }
2902
2903         link = f.file->private_data;
2904         bpf_link_inc(link);
2905         fdput(f);
2906
2907         return link;
2908 }
2909 EXPORT_SYMBOL(bpf_link_get_from_fd);
2910
2911 static void bpf_tracing_link_release(struct bpf_link *link)
2912 {
2913         struct bpf_tracing_link *tr_link =
2914                 container_of(link, struct bpf_tracing_link, link.link);
2915
2916         WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
2917                                                 tr_link->trampoline));
2918
2919         bpf_trampoline_put(tr_link->trampoline);
2920
2921         /* tgt_prog is NULL if target is a kernel function */
2922         if (tr_link->tgt_prog)
2923                 bpf_prog_put(tr_link->tgt_prog);
2924 }
2925
2926 static void bpf_tracing_link_dealloc(struct bpf_link *link)
2927 {
2928         struct bpf_tracing_link *tr_link =
2929                 container_of(link, struct bpf_tracing_link, link.link);
2930
2931         kfree(tr_link);
2932 }
2933
2934 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
2935                                          struct seq_file *seq)
2936 {
2937         struct bpf_tracing_link *tr_link =
2938                 container_of(link, struct bpf_tracing_link, link.link);
2939
2940         seq_printf(seq,
2941                    "attach_type:\t%d\n",
2942                    tr_link->attach_type);
2943 }
2944
2945 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
2946                                            struct bpf_link_info *info)
2947 {
2948         struct bpf_tracing_link *tr_link =
2949                 container_of(link, struct bpf_tracing_link, link.link);
2950
2951         info->tracing.attach_type = tr_link->attach_type;
2952         bpf_trampoline_unpack_key(tr_link->trampoline->key,
2953                                   &info->tracing.target_obj_id,
2954                                   &info->tracing.target_btf_id);
2955
2956         return 0;
2957 }
2958
2959 static const struct bpf_link_ops bpf_tracing_link_lops = {
2960         .release = bpf_tracing_link_release,
2961         .dealloc = bpf_tracing_link_dealloc,
2962         .show_fdinfo = bpf_tracing_link_show_fdinfo,
2963         .fill_link_info = bpf_tracing_link_fill_link_info,
2964 };
2965
2966 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
2967                                    int tgt_prog_fd,
2968                                    u32 btf_id,
2969                                    u64 bpf_cookie)
2970 {
2971         struct bpf_link_primer link_primer;
2972         struct bpf_prog *tgt_prog = NULL;
2973         struct bpf_trampoline *tr = NULL;
2974         struct bpf_tracing_link *link;
2975         u64 key = 0;
2976         int err;
2977
2978         switch (prog->type) {
2979         case BPF_PROG_TYPE_TRACING:
2980                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
2981                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
2982                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
2983                         err = -EINVAL;
2984                         goto out_put_prog;
2985                 }
2986                 break;
2987         case BPF_PROG_TYPE_EXT:
2988                 if (prog->expected_attach_type != 0) {
2989                         err = -EINVAL;
2990                         goto out_put_prog;
2991                 }
2992                 break;
2993         case BPF_PROG_TYPE_LSM:
2994                 if (prog->expected_attach_type != BPF_LSM_MAC) {
2995                         err = -EINVAL;
2996                         goto out_put_prog;
2997                 }
2998                 break;
2999         default:
3000                 err = -EINVAL;
3001                 goto out_put_prog;
3002         }
3003
3004         if (!!tgt_prog_fd != !!btf_id) {
3005                 err = -EINVAL;
3006                 goto out_put_prog;
3007         }
3008
3009         if (tgt_prog_fd) {
3010                 /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
3011                 if (prog->type != BPF_PROG_TYPE_EXT) {
3012                         err = -EINVAL;
3013                         goto out_put_prog;
3014                 }
3015
3016                 tgt_prog = bpf_prog_get(tgt_prog_fd);
3017                 if (IS_ERR(tgt_prog)) {
3018                         err = PTR_ERR(tgt_prog);
3019                         tgt_prog = NULL;
3020                         goto out_put_prog;
3021                 }
3022
3023                 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3024         }
3025
3026         link = kzalloc(sizeof(*link), GFP_USER);
3027         if (!link) {
3028                 err = -ENOMEM;
3029                 goto out_put_prog;
3030         }
3031         bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
3032                       &bpf_tracing_link_lops, prog);
3033         link->attach_type = prog->expected_attach_type;
3034         link->link.cookie = bpf_cookie;
3035
3036         mutex_lock(&prog->aux->dst_mutex);
3037
3038         /* There are a few possible cases here:
3039          *
3040          * - if prog->aux->dst_trampoline is set, the program was just loaded
3041          *   and not yet attached to anything, so we can use the values stored
3042          *   in prog->aux
3043          *
3044          * - if prog->aux->dst_trampoline is NULL, the program has already been
3045          *   attached to a target and its initial target was cleared (below)
3046          *
3047          * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3048          *   target_btf_id using the link_create API.
3049          *
3050          * - if tgt_prog == NULL, this function was called using the old
3051          *   raw_tracepoint_open API, and we need a target from prog->aux
3052          *
3053          * - if both prog->aux->dst_trampoline and tgt_prog are NULL, the
3054          *   program was detached and is being re-attached.
3055          */
3056         if (!prog->aux->dst_trampoline && !tgt_prog) {
3057                 /*
3058                  * Allow re-attach for TRACING and LSM programs. If it's
3059                  * currently linked, bpf_trampoline_link_prog will fail.
3060                  * EXT programs need to specify tgt_prog_fd, so they
3061                  * re-attach in separate code path.
3062                  * re-attach in a separate code path.
3063                 if (prog->type != BPF_PROG_TYPE_TRACING &&
3064                     prog->type != BPF_PROG_TYPE_LSM) {
3065                         err = -EINVAL;
3066                         goto out_unlock;
3067                 }
3068                 btf_id = prog->aux->attach_btf_id;
3069                 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
3070         }
3071
3072         if (!prog->aux->dst_trampoline ||
3073             (key && key != prog->aux->dst_trampoline->key)) {
3074                 /* If there is no saved target, or the specified target is
3075                  * different from the destination specified at load time, we
3076                  * need a new trampoline and a check for compatibility
3077                  */
3078                 struct bpf_attach_target_info tgt_info = {};
3079
3080                 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3081                                               &tgt_info);
3082                 if (err)
3083                         goto out_unlock;
3084
3085                 tr = bpf_trampoline_get(key, &tgt_info);
3086                 if (!tr) {
3087                         err = -ENOMEM;
3088                         goto out_unlock;
3089                 }
3090         } else {
3091                 /* The caller didn't specify a target, or the target was the
3092                  * same as the destination supplied during program load. This
3093                  * means we can reuse the trampoline and reference from program
3094                  * load time, and there is no need to allocate a new one. This
3095                  * can only happen once for any program, as the saved values in
3096                  * prog->aux are cleared below.
3097                  */
3098                 tr = prog->aux->dst_trampoline;
3099                 tgt_prog = prog->aux->dst_prog;
3100         }
3101
3102         err = bpf_link_prime(&link->link.link, &link_primer);
3103         if (err)
3104                 goto out_unlock;
3105
3106         err = bpf_trampoline_link_prog(&link->link, tr);
3107         if (err) {
3108                 bpf_link_cleanup(&link_primer);
3109                 link = NULL;
3110                 goto out_unlock;
3111         }
3112
3113         link->tgt_prog = tgt_prog;
3114         link->trampoline = tr;
3115
3116         /* Always clear the trampoline and target prog from prog->aux to make
3117          * sure the original attach destination is not kept alive after a
3118          * program is (re-)attached to another target.
3119          */
3120         if (prog->aux->dst_prog &&
3121             (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3122                 /* got extra prog ref from syscall, or attaching to different prog */
3123                 bpf_prog_put(prog->aux->dst_prog);
3124         if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3125                 /* we allocated a new trampoline, so free the old one */
3126                 bpf_trampoline_put(prog->aux->dst_trampoline);
3127
3128         prog->aux->dst_prog = NULL;
3129         prog->aux->dst_trampoline = NULL;
3130         mutex_unlock(&prog->aux->dst_mutex);
3131
3132         return bpf_link_settle(&link_primer);
3133 out_unlock:
3134         if (tr && tr != prog->aux->dst_trampoline)
3135                 bpf_trampoline_put(tr);
3136         mutex_unlock(&prog->aux->dst_mutex);
3137         kfree(link);
3138 out_put_prog:
3139         if (tgt_prog_fd && tgt_prog)
3140                 bpf_prog_put(tgt_prog);
3141         return err;
3142 }
3143
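/* A raw tracepoint link pairs the generic bpf_link with the resolved
 * bpf_raw_event_map, so that releasing the link can unregister the probe
 * and drop the tracepoint reference taken at attach time.
 */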
3144 struct bpf_raw_tp_link {
3145         struct bpf_link link;
3146         struct bpf_raw_event_map *btp;
3147 };
3148
3149 static void bpf_raw_tp_link_release(struct bpf_link *link)
3150 {
3151         struct bpf_raw_tp_link *raw_tp =
3152                 container_of(link, struct bpf_raw_tp_link, link);
3153
3154         bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
3155         bpf_put_raw_tracepoint(raw_tp->btp);
3156 }
3157
3158 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3159 {
3160         struct bpf_raw_tp_link *raw_tp =
3161                 container_of(link, struct bpf_raw_tp_link, link);
3162
3163         kfree(raw_tp);
3164 }
3165
3166 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3167                                         struct seq_file *seq)
3168 {
3169         struct bpf_raw_tp_link *raw_tp_link =
3170                 container_of(link, struct bpf_raw_tp_link, link);
3171
3172         seq_printf(seq,
3173                    "tp_name:\t%s\n",
3174                    raw_tp_link->btp->tp->name);
3175 }
3176
3177 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3178                                           struct bpf_link_info *info)
3179 {
3180         struct bpf_raw_tp_link *raw_tp_link =
3181                 container_of(link, struct bpf_raw_tp_link, link);
3182         char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3183         const char *tp_name = raw_tp_link->btp->tp->name;
3184         u32 ulen = info->raw_tracepoint.tp_name_len;
3185         size_t tp_len = strlen(tp_name);
3186
3187         if (!ulen ^ !ubuf)
3188                 return -EINVAL;
3189
3190         info->raw_tracepoint.tp_name_len = tp_len + 1;
3191
3192         if (!ubuf)
3193                 return 0;
3194
3195         if (ulen >= tp_len + 1) {
3196                 if (copy_to_user(ubuf, tp_name, tp_len + 1))
3197                         return -EFAULT;
3198         } else {
3199                 char zero = '\0';
3200
3201                 if (copy_to_user(ubuf, tp_name, ulen - 1))
3202                         return -EFAULT;
3203                 if (put_user(zero, ubuf + ulen - 1))
3204                         return -EFAULT;
3205                 return -ENOSPC;
3206         }
3207
3208         return 0;
3209 }
3210
3211 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3212         .release = bpf_raw_tp_link_release,
3213         .dealloc = bpf_raw_tp_link_dealloc,
3214         .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3215         .fill_link_info = bpf_raw_tp_link_fill_link_info,
3216 };
3217
3218 #ifdef CONFIG_PERF_EVENTS
3219 struct bpf_perf_link {
3220         struct bpf_link link;
3221         struct file *perf_file;
3222 };
3223
3224 static void bpf_perf_link_release(struct bpf_link *link)
3225 {
3226         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3227         struct perf_event *event = perf_link->perf_file->private_data;
3228
3229         perf_event_free_bpf_prog(event);
3230         fput(perf_link->perf_file);
3231 }
3232
3233 static void bpf_perf_link_dealloc(struct bpf_link *link)
3234 {
3235         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3236
3237         kfree(perf_link);
3238 }
3239
3240 static const struct bpf_link_ops bpf_perf_link_lops = {
3241         .release = bpf_perf_link_release,
3242         .dealloc = bpf_perf_link_dealloc,
3243 };
3244
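/* Create a BPF_LINK_TYPE_PERF_EVENT link: look up the perf event file behind
 * link_create.target_fd, wire @prog into it via perf_event_set_bpf_prog()
 * and return a new link fd. The perf file reference is held by the link and
 * dropped again when the link is released.
 */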
3245 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3246 {
3247         struct bpf_link_primer link_primer;
3248         struct bpf_perf_link *link;
3249         struct perf_event *event;
3250         struct file *perf_file;
3251         int err;
3252
3253         if (attr->link_create.flags)
3254                 return -EINVAL;
3255
3256         perf_file = perf_event_get(attr->link_create.target_fd);
3257         if (IS_ERR(perf_file))
3258                 return PTR_ERR(perf_file);
3259
3260         link = kzalloc(sizeof(*link), GFP_USER);
3261         if (!link) {
3262                 err = -ENOMEM;
3263                 goto out_put_file;
3264         }
3265         bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
3266         link->perf_file = perf_file;
3267
3268         err = bpf_link_prime(&link->link, &link_primer);
3269         if (err) {
3270                 kfree(link);
3271                 goto out_put_file;
3272         }
3273
3274         event = perf_file->private_data;
3275         err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
3276         if (err) {
3277                 bpf_link_cleanup(&link_primer);
3278                 goto out_put_file;
3279         }
3280         /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3281         bpf_prog_inc(prog);
3282
3283         return bpf_link_settle(&link_primer);
3284
3285 out_put_file:
3286         fput(perf_file);
3287         return err;
3288 }
3289 #else
3290 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3291 {
3292         return -EOPNOTSUPP;
3293 }
3294 #endif /* CONFIG_PERF_EVENTS */
3295
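/* Resolve the raw tracepoint name for @prog (from user memory, or from the
 * attach_func_name recorded at load time for BPF_TRACE_RAW_TP programs),
 * register the program on it and return a new link fd. TRACING/EXT/LSM
 * programs that do not target a raw tracepoint are redirected to
 * bpf_tracing_prog_attach().
 */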
3296 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3297                                   const char __user *user_tp_name)
3298 {
3299         struct bpf_link_primer link_primer;
3300         struct bpf_raw_tp_link *link;
3301         struct bpf_raw_event_map *btp;
3302         const char *tp_name;
3303         char buf[128];
3304         int err;
3305
3306         switch (prog->type) {
3307         case BPF_PROG_TYPE_TRACING:
3308         case BPF_PROG_TYPE_EXT:
3309         case BPF_PROG_TYPE_LSM:
3310                 if (user_tp_name)
3311                         /* The attach point for this category of programs
3312                          * should be specified via btf_id during program load.
3313                          */
3314                         return -EINVAL;
3315                 if (prog->type == BPF_PROG_TYPE_TRACING &&
3316                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3317                         tp_name = prog->aux->attach_func_name;
3318                         break;
3319                 }
3320                 return bpf_tracing_prog_attach(prog, 0, 0, 0);
3321         case BPF_PROG_TYPE_RAW_TRACEPOINT:
3322         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3323                 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
3324                         return -EFAULT;
3325                 buf[sizeof(buf) - 1] = 0;
3326                 tp_name = buf;
3327                 break;
3328         default:
3329                 return -EINVAL;
3330         }
3331
3332         btp = bpf_get_raw_tracepoint(tp_name);
3333         if (!btp)
3334                 return -ENOENT;
3335
3336         link = kzalloc(sizeof(*link), GFP_USER);
3337         if (!link) {
3338                 err = -ENOMEM;
3339                 goto out_put_btp;
3340         }
3341         bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
3342                       &bpf_raw_tp_link_lops, prog);
3343         link->btp = btp;
3344
3345         err = bpf_link_prime(&link->link, &link_primer);
3346         if (err) {
3347                 kfree(link);
3348                 goto out_put_btp;
3349         }
3350
3351         err = bpf_probe_register(link->btp, prog);
3352         if (err) {
3353                 bpf_link_cleanup(&link_primer);
3354                 goto out_put_btp;
3355         }
3356
3357         return bpf_link_settle(&link_primer);
3358
3359 out_put_btp:
3360         bpf_put_raw_tracepoint(btp);
3361         return err;
3362 }
3363
3364 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3365
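/* Handler for the legacy BPF_RAW_TRACEPOINT_OPEN command. Purely as an
 * illustrative sketch (not taken from this file), user space would invoke it
 * roughly like this, assuming prog_fd refers to a loaded raw_tp program and
 * "sched_switch" is the tracepoint of interest:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.raw_tracepoint.name = (__u64)(unsigned long)"sched_switch";
 *	attr.raw_tracepoint.prog_fd = prog_fd;
 *	link_fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
 */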
3366 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3367 {
3368         struct bpf_prog *prog;
3369         int fd;
3370
3371         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3372                 return -EINVAL;
3373
3374         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
3375         if (IS_ERR(prog))
3376                 return PTR_ERR(prog);
3377
3378         fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3379         if (fd < 0)
3380                 bpf_prog_put(prog);
3381         return fd;
3382 }
3383
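/* For program types that can serve several attach points, make sure the
 * attach type requested now matches the expected_attach_type supplied at
 * load time; cg-skb programs additionally need CAP_NET_ADMIN here because
 * they may have been loaded by an unprivileged user.
 */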
3384 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3385                                              enum bpf_attach_type attach_type)
3386 {
3387         switch (prog->type) {
3388         case BPF_PROG_TYPE_CGROUP_SOCK:
3389         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3390         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3391         case BPF_PROG_TYPE_SK_LOOKUP:
3392                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3393         case BPF_PROG_TYPE_CGROUP_SKB:
3394                 if (!capable(CAP_NET_ADMIN))
3395                         /* cg-skb progs can be loaded by unpriv user.
3396                          * check permissions at attach time.
3397                          */
3398                         return -EPERM;
3399                 return prog->enforce_expected_attach_type &&
3400                         prog->expected_attach_type != attach_type ?
3401                         -EINVAL : 0;
3402         default:
3403                 return 0;
3404         }
3405 }
3406
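/* Map a uapi attach type onto the program type that is allowed to use it;
 * BPF_PROG_TYPE_UNSPEC means the attach type is not handled here.
 */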
3407 static enum bpf_prog_type
3408 attach_type_to_prog_type(enum bpf_attach_type attach_type)
3409 {
3410         switch (attach_type) {
3411         case BPF_CGROUP_INET_INGRESS:
3412         case BPF_CGROUP_INET_EGRESS:
3413                 return BPF_PROG_TYPE_CGROUP_SKB;
3414         case BPF_CGROUP_INET_SOCK_CREATE:
3415         case BPF_CGROUP_INET_SOCK_RELEASE:
3416         case BPF_CGROUP_INET4_POST_BIND:
3417         case BPF_CGROUP_INET6_POST_BIND:
3418                 return BPF_PROG_TYPE_CGROUP_SOCK;
3419         case BPF_CGROUP_INET4_BIND:
3420         case BPF_CGROUP_INET6_BIND:
3421         case BPF_CGROUP_INET4_CONNECT:
3422         case BPF_CGROUP_INET6_CONNECT:
3423         case BPF_CGROUP_INET4_GETPEERNAME:
3424         case BPF_CGROUP_INET6_GETPEERNAME:
3425         case BPF_CGROUP_INET4_GETSOCKNAME:
3426         case BPF_CGROUP_INET6_GETSOCKNAME:
3427         case BPF_CGROUP_UDP4_SENDMSG:
3428         case BPF_CGROUP_UDP6_SENDMSG:
3429         case BPF_CGROUP_UDP4_RECVMSG:
3430         case BPF_CGROUP_UDP6_RECVMSG:
3431                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3432         case BPF_CGROUP_SOCK_OPS:
3433                 return BPF_PROG_TYPE_SOCK_OPS;
3434         case BPF_CGROUP_DEVICE:
3435                 return BPF_PROG_TYPE_CGROUP_DEVICE;
3436         case BPF_SK_MSG_VERDICT:
3437                 return BPF_PROG_TYPE_SK_MSG;
3438         case BPF_SK_SKB_STREAM_PARSER:
3439         case BPF_SK_SKB_STREAM_VERDICT:
3440         case BPF_SK_SKB_VERDICT:
3441                 return BPF_PROG_TYPE_SK_SKB;
3442         case BPF_LIRC_MODE2:
3443                 return BPF_PROG_TYPE_LIRC_MODE2;
3444         case BPF_FLOW_DISSECTOR:
3445                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
3446         case BPF_CGROUP_SYSCTL:
3447                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
3448         case BPF_CGROUP_GETSOCKOPT:
3449         case BPF_CGROUP_SETSOCKOPT:
3450                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3451         case BPF_TRACE_ITER:
3452         case BPF_TRACE_RAW_TP:
3453         case BPF_TRACE_FENTRY:
3454         case BPF_TRACE_FEXIT:
3455         case BPF_MODIFY_RETURN:
3456                 return BPF_PROG_TYPE_TRACING;
3457         case BPF_LSM_MAC:
3458                 return BPF_PROG_TYPE_LSM;
3459         case BPF_SK_LOOKUP:
3460                 return BPF_PROG_TYPE_SK_LOOKUP;
3461         case BPF_XDP:
3462                 return BPF_PROG_TYPE_XDP;
3463         case BPF_LSM_CGROUP:
3464                 return BPF_PROG_TYPE_LSM;
3465         default:
3466                 return BPF_PROG_TYPE_UNSPEC;
3467         }
3468 }
3469
3470 #define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
3471
3472 #define BPF_F_ATTACH_MASK \
3473         (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
3474
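/* Handler for BPF_PROG_ATTACH. As an illustrative sketch only (not part of
 * this file), attaching a cgroup_skb program to a cgroup could look roughly
 * like this, assuming cgroup_fd and prog_fd are valid descriptors:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd = cgroup_fd;
 *	attr.attach_bpf_fd = prog_fd;
 *	attr.attach_type = BPF_CGROUP_INET_INGRESS;
 *	attr.attach_flags = BPF_F_ALLOW_MULTI;
 *	err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */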
3475 static int bpf_prog_attach(const union bpf_attr *attr)
3476 {
3477         enum bpf_prog_type ptype;
3478         struct bpf_prog *prog;
3479         int ret;
3480
3481         if (CHECK_ATTR(BPF_PROG_ATTACH))
3482                 return -EINVAL;
3483
3484         if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
3485                 return -EINVAL;
3486
3487         ptype = attach_type_to_prog_type(attr->attach_type);
3488         if (ptype == BPF_PROG_TYPE_UNSPEC)
3489                 return -EINVAL;
3490
3491         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
3492         if (IS_ERR(prog))
3493                 return PTR_ERR(prog);
3494
3495         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
3496                 bpf_prog_put(prog);
3497                 return -EINVAL;
3498         }
3499
3500         switch (ptype) {
3501         case BPF_PROG_TYPE_SK_SKB:
3502         case BPF_PROG_TYPE_SK_MSG:
3503                 ret = sock_map_get_from_fd(attr, prog);
3504                 break;
3505         case BPF_PROG_TYPE_LIRC_MODE2:
3506                 ret = lirc_prog_attach(attr, prog);
3507                 break;
3508         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3509                 ret = netns_bpf_prog_attach(attr, prog);
3510                 break;
3511         case BPF_PROG_TYPE_CGROUP_DEVICE:
3512         case BPF_PROG_TYPE_CGROUP_SKB:
3513         case BPF_PROG_TYPE_CGROUP_SOCK:
3514         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3515         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3516         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3517         case BPF_PROG_TYPE_SOCK_OPS:
3518         case BPF_PROG_TYPE_LSM:
3519                 if (ptype == BPF_PROG_TYPE_LSM &&
3520                     prog->expected_attach_type != BPF_LSM_CGROUP)
3521                         ret = -EINVAL;
3522                 else
3523                         ret = cgroup_bpf_prog_attach(attr, ptype, prog);
3524                 break;
3525         default:
3526                 ret = -EINVAL;
3527         }
3528
3529         if (ret)
3530                 bpf_prog_put(prog);
3531         return ret;
3532 }
3533
3534 #define BPF_PROG_DETACH_LAST_FIELD attach_type
3535
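/* Handler for BPF_PROG_DETACH: route to the sockmap, lirc, netns or cgroup
 * detach path based on the attach type.
 */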
3536 static int bpf_prog_detach(const union bpf_attr *attr)
3537 {
3538         enum bpf_prog_type ptype;
3539
3540         if (CHECK_ATTR(BPF_PROG_DETACH))
3541                 return -EINVAL;
3542
3543         ptype = attach_type_to_prog_type(attr->attach_type);
3544
3545         switch (ptype) {
3546         case BPF_PROG_TYPE_SK_MSG:
3547         case BPF_PROG_TYPE_SK_SKB:
3548                 return sock_map_prog_detach(attr, ptype);
3549         case BPF_PROG_TYPE_LIRC_MODE2:
3550                 return lirc_prog_detach(attr);
3551         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3552                 return netns_bpf_prog_detach(attr, ptype);
3553         case BPF_PROG_TYPE_CGROUP_DEVICE:
3554         case BPF_PROG_TYPE_CGROUP_SKB:
3555         case BPF_PROG_TYPE_CGROUP_SOCK:
3556         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3557         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3558         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3559         case BPF_PROG_TYPE_SOCK_OPS:
3560         case BPF_PROG_TYPE_LSM:
3561                 return cgroup_bpf_prog_detach(attr, ptype);
3562         default:
3563                 return -EINVAL;
3564         }
3565 }
3566
3567 #define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
3568
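/* Handler for BPF_PROG_QUERY: dispatch to the cgroup, lirc, netns or sockmap
 * backend based on query.attach_type, which copies the IDs of the currently
 * attached programs back to user space.
 */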
3569 static int bpf_prog_query(const union bpf_attr *attr,
3570                           union bpf_attr __user *uattr)
3571 {
3572         if (!capable(CAP_NET_ADMIN))
3573                 return -EPERM;
3574         if (CHECK_ATTR(BPF_PROG_QUERY))
3575                 return -EINVAL;
3576         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
3577                 return -EINVAL;
3578
3579         switch (attr->query.attach_type) {
3580         case BPF_CGROUP_INET_INGRESS:
3581         case BPF_CGROUP_INET_EGRESS:
3582         case BPF_CGROUP_INET_SOCK_CREATE:
3583         case BPF_CGROUP_INET_SOCK_RELEASE:
3584         case BPF_CGROUP_INET4_BIND:
3585         case BPF_CGROUP_INET6_BIND:
3586         case BPF_CGROUP_INET4_POST_BIND:
3587         case BPF_CGROUP_INET6_POST_BIND:
3588         case BPF_CGROUP_INET4_CONNECT:
3589         case BPF_CGROUP_INET6_CONNECT:
3590         case BPF_CGROUP_INET4_GETPEERNAME:
3591         case BPF_CGROUP_INET6_GETPEERNAME:
3592         case BPF_CGROUP_INET4_GETSOCKNAME:
3593         case BPF_CGROUP_INET6_GETSOCKNAME:
3594         case BPF_CGROUP_UDP4_SENDMSG:
3595         case BPF_CGROUP_UDP6_SENDMSG:
3596         case BPF_CGROUP_UDP4_RECVMSG:
3597         case BPF_CGROUP_UDP6_RECVMSG:
3598         case BPF_CGROUP_SOCK_OPS:
3599         case BPF_CGROUP_DEVICE:
3600         case BPF_CGROUP_SYSCTL:
3601         case BPF_CGROUP_GETSOCKOPT:
3602         case BPF_CGROUP_SETSOCKOPT:
3603         case BPF_LSM_CGROUP:
3604                 return cgroup_bpf_prog_query(attr, uattr);
3605         case BPF_LIRC_MODE2:
3606                 return lirc_prog_query(attr, uattr);
3607         case BPF_FLOW_DISSECTOR:
3608         case BPF_SK_LOOKUP:
3609                 return netns_bpf_prog_query(attr, uattr);
3610         case BPF_SK_SKB_STREAM_PARSER:
3611         case BPF_SK_SKB_STREAM_VERDICT:
3612         case BPF_SK_MSG_VERDICT:
3613         case BPF_SK_SKB_VERDICT:
3614                 return sock_map_bpf_prog_query(attr, uattr);
3615         default:
3616                 return -EINVAL;
3617         }
3618 }
3619
3620 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
3621
3622 static int bpf_prog_test_run(const union bpf_attr *attr,
3623                              union bpf_attr __user *uattr)
3624 {
3625         struct bpf_prog *prog;
3626         int ret = -ENOTSUPP;
3627
3628         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
3629                 return -EINVAL;
3630
3631         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
3632             (!attr->test.ctx_size_in && attr->test.ctx_in))
3633                 return -EINVAL;
3634
3635         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
3636             (!attr->test.ctx_size_out && attr->test.ctx_out))
3637                 return -EINVAL;
3638
3639         prog = bpf_prog_get(attr->test.prog_fd);
3640         if (IS_ERR(prog))
3641                 return PTR_ERR(prog);
3642
3643         if (prog->aux->ops->test_run)
3644                 ret = prog->aux->ops->test_run(prog, attr, uattr);
3645
3646         bpf_prog_put(prog);
3647         return ret;
3648 }
3649
3650 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
3651
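/* Shared implementation of the *_GET_NEXT_ID commands: return the first ID
 * in @idr that is strictly greater than attr->start_id, or -ENOENT once the
 * ID space is exhausted. User space iterates by feeding next_id back in as
 * start_id.
 */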
3652 static int bpf_obj_get_next_id(const union bpf_attr *attr,
3653                                union bpf_attr __user *uattr,
3654                                struct idr *idr,
3655                                spinlock_t *lock)
3656 {
3657         u32 next_id = attr->start_id;
3658         int err = 0;
3659
3660         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
3661                 return -EINVAL;
3662
3663         if (!capable(CAP_SYS_ADMIN))
3664                 return -EPERM;
3665
3666         next_id++;
3667         spin_lock_bh(lock);
3668         if (!idr_get_next(idr, &next_id))
3669                 err = -ENOENT;
3670         spin_unlock_bh(lock);
3671
3672         if (!err)
3673                 err = put_user(next_id, &uattr->next_id);
3674
3675         return err;
3676 }
3677
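/* Return a referenced map whose ID is the lowest one >= *id, skipping maps
 * whose refcount has already dropped to zero; *id is updated to the ID of
 * the map that is returned. bpf_prog_get_curr_or_next() below does the same
 * for programs.
 */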
3678 struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
3679 {
3680         struct bpf_map *map;
3681
3682         spin_lock_bh(&map_idr_lock);
3683 again:
3684         map = idr_get_next(&map_idr, id);
3685         if (map) {
3686                 map = __bpf_map_inc_not_zero(map, false);
3687                 if (IS_ERR(map)) {
3688                         (*id)++;
3689                         goto again;
3690                 }
3691         }
3692         spin_unlock_bh(&map_idr_lock);
3693
3694         return map;
3695 }
3696
3697 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
3698 {
3699         struct bpf_prog *prog;
3700
3701         spin_lock_bh(&prog_idr_lock);
3702 again:
3703         prog = idr_get_next(&prog_idr, id);
3704         if (prog) {
3705                 prog = bpf_prog_inc_not_zero(prog);
3706                 if (IS_ERR(prog)) {
3707                         (*id)++;
3708                         goto again;
3709                 }
3710         }
3711         spin_unlock_bh(&prog_idr_lock);
3712
3713         return prog;
3714 }
3715
3716 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
3717
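/* Look up a program by ID and take a reference on it; returns -ENOENT for
 * ID 0, for unknown IDs and for programs whose refcount already hit zero.
 */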
3718 struct bpf_prog *bpf_prog_by_id(u32 id)
3719 {
3720         struct bpf_prog *prog;
3721
3722         if (!id)
3723                 return ERR_PTR(-ENOENT);
3724
3725         spin_lock_bh(&prog_idr_lock);
3726         prog = idr_find(&prog_idr, id);
3727         if (prog)
3728                 prog = bpf_prog_inc_not_zero(prog);
3729         else
3730                 prog = ERR_PTR(-ENOENT);
3731         spin_unlock_bh(&prog_idr_lock);
3732         return prog;
3733 }
3734
3735 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
3736 {
3737         struct bpf_prog *prog;
3738         u32 id = attr->prog_id;
3739         int fd;
3740
3741         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
3742                 return -EINVAL;
3743
3744         if (!capable(CAP_SYS_ADMIN))
3745                 return -EPERM;
3746
3747         prog = bpf_prog_by_id(id);
3748         if (IS_ERR(prog))
3749                 return PTR_ERR(prog);
3750
3751         fd = bpf_prog_new_fd(prog);
3752         if (fd < 0)
3753                 bpf_prog_put(prog);
3754
3755         return fd;
3756 }
3757
3758 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
3759
3760 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
3761 {
3762         struct bpf_map *map;
3763         u32 id = attr->map_id;
3764         int f_flags;
3765         int fd;
3766
3767         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
3768             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
3769                 return -EINVAL;
3770
3771         if (!capable(CAP_SYS_ADMIN))
3772                 return -EPERM;
3773
3774         f_flags = bpf_get_file_flag(attr->open_flags);
3775         if (f_flags < 0)
3776                 return f_flags;
3777
3778         spin_lock_bh(&map_idr_lock);
3779         map = idr_find(&map_idr, id);
3780         if (map)
3781                 map = __bpf_map_inc_not_zero(map, true);
3782         else
3783                 map = ERR_PTR(-ENOENT);
3784         spin_unlock_bh(&map_idr_lock);
3785
3786         if (IS_ERR(map))
3787                 return PTR_ERR(map);
3788
3789         fd = bpf_map_new_fd(map, f_flags);
3790         if (fd < 0)
3791                 bpf_map_put_with_uref(map);
3792
3793         return fd;
3794 }
3795
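/* Reverse-map a pointer embedded in a ld_imm64 instruction back to the map
 * it refers to: either the map itself (BPF_PSEUDO_MAP_FD) or an address
 * inside a map value (BPF_PSEUDO_MAP_VALUE, with *off set to the offset).
 */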
3796 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
3797                                               unsigned long addr, u32 *off,
3798                                               u32 *type)
3799 {
3800         const struct bpf_map *map;
3801         int i;
3802
3803         mutex_lock(&prog->aux->used_maps_mutex);
3804         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
3805                 map = prog->aux->used_maps[i];
3806                 if (map == (void *)addr) {
3807                         *type = BPF_PSEUDO_MAP_FD;
3808                         goto out;
3809                 }
3810                 if (!map->ops->map_direct_value_meta)
3811                         continue;
3812                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
3813                         *type = BPF_PSEUDO_MAP_VALUE;
3814                         goto out;
3815                 }
3816         }
3817         map = NULL;
3818
3819 out:
3820         mutex_unlock(&prog->aux->used_maps_mutex);
3821         return map;
3822 }
3823
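/* Make a dumpable copy of the program's instructions: rewrite tail calls,
 * internal call variants and BPF_PROBE_MEM loads back to their uapi
 * encodings, zero call immediates when the caller may not see raw kernel
 * addresses, and turn map pointers in ld_imm64 back into map ID + offset.
 */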
3824 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
3825                                               const struct cred *f_cred)
3826 {
3827         const struct bpf_map *map;
3828         struct bpf_insn *insns;
3829         u32 off, type;
3830         u64 imm;
3831         u8 code;
3832         int i;
3833
3834         insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
3835                         GFP_USER);
3836         if (!insns)
3837                 return insns;
3838
3839         for (i = 0; i < prog->len; i++) {
3840                 code = insns[i].code;
3841
3842                 if (code == (BPF_JMP | BPF_TAIL_CALL)) {
3843                         insns[i].code = BPF_JMP | BPF_CALL;
3844                         insns[i].imm = BPF_FUNC_tail_call;
3845                         /* fall-through */
3846                 }
3847                 if (code == (BPF_JMP | BPF_CALL) ||
3848                     code == (BPF_JMP | BPF_CALL_ARGS)) {
3849                         if (code == (BPF_JMP | BPF_CALL_ARGS))
3850                                 insns[i].code = BPF_JMP | BPF_CALL;
3851                         if (!bpf_dump_raw_ok(f_cred))
3852                                 insns[i].imm = 0;
3853                         continue;
3854                 }
3855                 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
3856                         insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
3857                         continue;
3858                 }
3859
3860                 if (code != (BPF_LD | BPF_IMM | BPF_DW))
3861                         continue;
3862
3863                 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
3864                 map = bpf_map_from_imm(prog, imm, &off, &type);
3865                 if (map) {
3866                         insns[i].src_reg = type;
3867                         insns[i].imm = map->id;
3868                         insns[i + 1].imm = off;
3869                         continue;
3870                 }
3871         }
3872
3873         return insns;
3874 }
3875
3876 static int set_info_rec_size(struct bpf_prog_info *info)
3877 {
3878         /*
3879          * Each info.*_rec_size must either match the size the kernel expects
3880          *
3881          * or
3882          *
3883          * be zero together with the matching *_cnt.  In that case, the
3884          * kernel writes the expected _rec_size values back into the info
3885          * below.
3886          */
3887
3888         if ((info->nr_func_info || info->func_info_rec_size) &&
3889             info->func_info_rec_size != sizeof(struct bpf_func_info))
3890                 return -EINVAL;
3891
3892         if ((info->nr_line_info || info->line_info_rec_size) &&
3893             info->line_info_rec_size != sizeof(struct bpf_line_info))
3894                 return -EINVAL;
3895
3896         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
3897             info->jited_line_info_rec_size != sizeof(__u64))
3898                 return -EINVAL;
3899
3900         info->func_info_rec_size = sizeof(struct bpf_func_info);
3901         info->line_info_rec_size = sizeof(struct bpf_line_info);
3902         info->jited_line_info_rec_size = sizeof(__u64);
3903
3904         return 0;
3905 }
3906
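/* Fill a bpf_prog_info for BPF_OBJ_GET_INFO_BY_FD. Sensitive data (xlated
 * and JITed images, ksym addresses, JITed line info) is only exposed to
 * callers that pass the bpf_capable()/bpf_dump_raw_ok() checks; everything
 * else is copied out as far as the user-supplied buffer sizes allow.
 */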
3907 static int bpf_prog_get_info_by_fd(struct file *file,
3908                                    struct bpf_prog *prog,
3909                                    const union bpf_attr *attr,
3910                                    union bpf_attr __user *uattr)
3911 {
3912         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3913         struct btf *attach_btf = bpf_prog_get_target_btf(prog);
3914         struct bpf_prog_info info;
3915         u32 info_len = attr->info.info_len;
3916         struct bpf_prog_kstats stats;
3917         char __user *uinsns;
3918         u32 ulen;
3919         int err;
3920
3921         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
3922         if (err)
3923                 return err;
3924         info_len = min_t(u32, sizeof(info), info_len);
3925
3926         memset(&info, 0, sizeof(info));
3927         if (copy_from_user(&info, uinfo, info_len))
3928                 return -EFAULT;
3929
3930         info.type = prog->type;
3931         info.id = prog->aux->id;
3932         info.load_time = prog->aux->load_time;
3933         info.created_by_uid = from_kuid_munged(current_user_ns(),
3934                                                prog->aux->user->uid);
3935         info.gpl_compatible = prog->gpl_compatible;
3936
3937         memcpy(info.tag, prog->tag, sizeof(prog->tag));
3938         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
3939
3940         mutex_lock(&prog->aux->used_maps_mutex);
3941         ulen = info.nr_map_ids;
3942         info.nr_map_ids = prog->aux->used_map_cnt;
3943         ulen = min_t(u32, info.nr_map_ids, ulen);
3944         if (ulen) {
3945                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
3946                 u32 i;
3947
3948                 for (i = 0; i < ulen; i++)
3949                         if (put_user(prog->aux->used_maps[i]->id,
3950                                      &user_map_ids[i])) {
3951                                 mutex_unlock(&prog->aux->used_maps_mutex);
3952                                 return -EFAULT;
3953                         }
3954         }
3955         mutex_unlock(&prog->aux->used_maps_mutex);
3956
3957         err = set_info_rec_size(&info);
3958         if (err)
3959                 return err;
3960
3961         bpf_prog_get_stats(prog, &stats);
3962         info.run_time_ns = stats.nsecs;
3963         info.run_cnt = stats.cnt;
3964         info.recursion_misses = stats.misses;
3965
3966         info.verified_insns = prog->aux->verified_insns;
3967
3968         if (!bpf_capable()) {
3969                 info.jited_prog_len = 0;
3970                 info.xlated_prog_len = 0;
3971                 info.nr_jited_ksyms = 0;
3972                 info.nr_jited_func_lens = 0;
3973                 info.nr_func_info = 0;
3974                 info.nr_line_info = 0;
3975                 info.nr_jited_line_info = 0;
3976                 goto done;
3977         }
3978
3979         ulen = info.xlated_prog_len;
3980         info.xlated_prog_len = bpf_prog_insn_size(prog);
3981         if (info.xlated_prog_len && ulen) {
3982                 struct bpf_insn *insns_sanitized;
3983                 bool fault;
3984
3985                 if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
3986                         info.xlated_prog_insns = 0;
3987                         goto done;
3988                 }
3989                 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
3990                 if (!insns_sanitized)
3991                         return -ENOMEM;
3992                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
3993                 ulen = min_t(u32, info.xlated_prog_len, ulen);
3994                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
3995                 kfree(insns_sanitized);
3996                 if (fault)
3997                         return -EFAULT;
3998         }
3999
4000         if (bpf_prog_is_dev_bound(prog->aux)) {
4001                 err = bpf_prog_offload_info_fill(&info, prog);
4002                 if (err)
4003                         return err;
4004                 goto done;
4005         }
4006
4007         /* NOTE: the following code is supposed to be skipped for offload.
4008          * bpf_prog_offload_info_fill() is the place to fill similar fields
4009          * for offload.
4010          */
4011         ulen = info.jited_prog_len;
4012         if (prog->aux->func_cnt) {
4013                 u32 i;
4014
4015                 info.jited_prog_len = 0;
4016                 for (i = 0; i < prog->aux->func_cnt; i++)
4017                         info.jited_prog_len += prog->aux->func[i]->jited_len;
4018         } else {
4019                 info.jited_prog_len = prog->jited_len;
4020         }
4021
4022         if (info.jited_prog_len && ulen) {
4023                 if (bpf_dump_raw_ok(file->f_cred)) {
4024                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
4025                         ulen = min_t(u32, info.jited_prog_len, ulen);
4026
4027                         /* for multi-function programs, copy the JITed
4028                          * instructions for all the functions
4029                          */
4030                         if (prog->aux->func_cnt) {
4031                                 u32 len, free, i;
4032                                 u8 *img;
4033
4034                                 free = ulen;
4035                                 for (i = 0; i < prog->aux->func_cnt; i++) {
4036                                         len = prog->aux->func[i]->jited_len;
4037                                         len = min_t(u32, len, free);
4038                                         img = (u8 *) prog->aux->func[i]->bpf_func;
4039                                         if (copy_to_user(uinsns, img, len))
4040                                                 return -EFAULT;
4041                                         uinsns += len;
4042                                         free -= len;
4043                                         if (!free)
4044                                                 break;
4045                                 }
4046                         } else {
4047                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
4048                                         return -EFAULT;
4049                         }
4050                 } else {
4051                         info.jited_prog_insns = 0;
4052                 }
4053         }
4054
4055         ulen = info.nr_jited_ksyms;
4056         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4057         if (ulen) {
4058                 if (bpf_dump_raw_ok(file->f_cred)) {
4059                         unsigned long ksym_addr;
4060                         u64 __user *user_ksyms;
4061                         u32 i;
4062
4063                         /* copy the address of the kernel symbol
4064                          * corresponding to each function
4065                          */
4066                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4067                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4068                         if (prog->aux->func_cnt) {
4069                                 for (i = 0; i < ulen; i++) {
4070                                         ksym_addr = (unsigned long)
4071                                                 prog->aux->func[i]->bpf_func;
4072                                         if (put_user((u64) ksym_addr,
4073                                                      &user_ksyms[i]))
4074                                                 return -EFAULT;
4075                                 }
4076                         } else {
4077                                 ksym_addr = (unsigned long) prog->bpf_func;
4078                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
4079                                         return -EFAULT;
4080                         }
4081                 } else {
4082                         info.jited_ksyms = 0;
4083                 }
4084         }
4085
4086         ulen = info.nr_jited_func_lens;
4087         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4088         if (ulen) {
4089                 if (bpf_dump_raw_ok(file->f_cred)) {
4090                         u32 __user *user_lens;
4091                         u32 func_len, i;
4092
4093                         /* copy the JITed image lengths for each function */
4094                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4095                         user_lens = u64_to_user_ptr(info.jited_func_lens);
4096                         if (prog->aux->func_cnt) {
4097                                 for (i = 0; i < ulen; i++) {
4098                                         func_len =
4099                                                 prog->aux->func[i]->jited_len;
4100                                         if (put_user(func_len, &user_lens[i]))
4101                                                 return -EFAULT;
4102                                 }
4103                         } else {
4104                                 func_len = prog->jited_len;
4105                                 if (put_user(func_len, &user_lens[0]))
4106                                         return -EFAULT;
4107                         }
4108                 } else {
4109                         info.jited_func_lens = 0;
4110                 }
4111         }
4112
4113         if (prog->aux->btf)
4114                 info.btf_id = btf_obj_id(prog->aux->btf);
4115         info.attach_btf_id = prog->aux->attach_btf_id;
4116         if (attach_btf)
4117                 info.attach_btf_obj_id = btf_obj_id(attach_btf);
4118
4119         ulen = info.nr_func_info;
4120         info.nr_func_info = prog->aux->func_info_cnt;
4121         if (info.nr_func_info && ulen) {
4122                 char __user *user_finfo;
4123
4124                 user_finfo = u64_to_user_ptr(info.func_info);
4125                 ulen = min_t(u32, info.nr_func_info, ulen);
4126                 if (copy_to_user(user_finfo, prog->aux->func_info,
4127                                  info.func_info_rec_size * ulen))
4128                         return -EFAULT;
4129         }
4130
4131         ulen = info.nr_line_info;
4132         info.nr_line_info = prog->aux->nr_linfo;
4133         if (info.nr_line_info && ulen) {
4134                 __u8 __user *user_linfo;
4135
4136                 user_linfo = u64_to_user_ptr(info.line_info);
4137                 ulen = min_t(u32, info.nr_line_info, ulen);
4138                 if (copy_to_user(user_linfo, prog->aux->linfo,
4139                                  info.line_info_rec_size * ulen))
4140                         return -EFAULT;
4141         }
4142
4143         ulen = info.nr_jited_line_info;
4144         if (prog->aux->jited_linfo)
4145                 info.nr_jited_line_info = prog->aux->nr_linfo;
4146         else
4147                 info.nr_jited_line_info = 0;
4148         if (info.nr_jited_line_info && ulen) {
4149                 if (bpf_dump_raw_ok(file->f_cred)) {
4150                         unsigned long line_addr;
4151                         __u64 __user *user_linfo;
4152                         u32 i;
4153
4154                         user_linfo = u64_to_user_ptr(info.jited_line_info);
4155                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
4156                         for (i = 0; i < ulen; i++) {
4157                                 line_addr = (unsigned long)prog->aux->jited_linfo[i];
4158                                 if (put_user((__u64)line_addr, &user_linfo[i]))
4159                                         return -EFAULT;
4160                         }
4161                 } else {
4162                         info.jited_line_info = 0;
4163                 }
4164         }
4165
4166         ulen = info.nr_prog_tags;
4167         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4168         if (ulen) {
4169                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4170                 u32 i;
4171
4172                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
4173                 ulen = min_t(u32, info.nr_prog_tags, ulen);
4174                 if (prog->aux->func_cnt) {
4175                         for (i = 0; i < ulen; i++) {
4176                                 if (copy_to_user(user_prog_tags[i],
4177                                                  prog->aux->func[i]->tag,
4178                                                  BPF_TAG_SIZE))
4179                                         return -EFAULT;
4180                         }
4181                 } else {
4182                         if (copy_to_user(user_prog_tags[0],
4183                                          prog->tag, BPF_TAG_SIZE))
4184                                 return -EFAULT;
4185                 }
4186         }
4187
4188 done:
4189         if (copy_to_user(uinfo, &info, info_len) ||
4190             put_user(info_len, &uattr->info.info_len))
4191                 return -EFAULT;
4192
4193         return 0;
4194 }
4195
4196 static int bpf_map_get_info_by_fd(struct file *file,
4197                                   struct bpf_map *map,
4198                                   const union bpf_attr *attr,
4199                                   union bpf_attr __user *uattr)
4200 {
4201         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4202         struct bpf_map_info info;
4203         u32 info_len = attr->info.info_len;
4204         int err;
4205
4206         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4207         if (err)
4208                 return err;
4209         info_len = min_t(u32, sizeof(info), info_len);
4210
4211         memset(&info, 0, sizeof(info));
4212         info.type = map->map_type;
4213         info.id = map->id;
4214         info.key_size = map->key_size;
4215         info.value_size = map->value_size;
4216         info.max_entries = map->max_entries;
4217         info.map_flags = map->map_flags;
4218         info.map_extra = map->map_extra;
4219         memcpy(info.name, map->name, sizeof(map->name));
4220
4221         if (map->btf) {
4222                 info.btf_id = btf_obj_id(map->btf);
4223                 info.btf_key_type_id = map->btf_key_type_id;
4224                 info.btf_value_type_id = map->btf_value_type_id;
4225         }
4226         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4227
4228         if (bpf_map_is_dev_bound(map)) {
4229                 err = bpf_map_offload_info_fill(&info, map);
4230                 if (err)
4231                         return err;
4232         }
4233
4234         if (copy_to_user(uinfo, &info, info_len) ||
4235             put_user(info_len, &uattr->info.info_len))
4236                 return -EFAULT;
4237
4238         return 0;
4239 }
4240
4241 static int bpf_btf_get_info_by_fd(struct file *file,
4242                                   struct btf *btf,
4243                                   const union bpf_attr *attr,
4244                                   union bpf_attr __user *uattr)
4245 {
4246         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4247         u32 info_len = attr->info.info_len;
4248         int err;
4249
4250         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4251         if (err)
4252                 return err;
4253
4254         return btf_get_info_by_fd(btf, attr, uattr);
4255 }
4256
4257 static int bpf_link_get_info_by_fd(struct file *file,
4258                                   struct bpf_link *link,
4259                                   const union bpf_attr *attr,
4260                                   union bpf_attr __user *uattr)
4261 {
4262         struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4263         struct bpf_link_info info;
4264         u32 info_len = attr->info.info_len;
4265         int err;
4266
4267         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4268         if (err)
4269                 return err;
4270         info_len = min_t(u32, sizeof(info), info_len);
4271
4272         memset(&info, 0, sizeof(info));
4273         if (copy_from_user(&info, uinfo, info_len))
4274                 return -EFAULT;
4275
4276         info.type = link->type;
4277         info.id = link->id;
4278         info.prog_id = link->prog->aux->id;
4279
4280         if (link->ops->fill_link_info) {
4281                 err = link->ops->fill_link_info(link, &info);
4282                 if (err)
4283                         return err;
4284         }
4285
4286         if (copy_to_user(uinfo, &info, info_len) ||
4287             put_user(info_len, &uattr->info.info_len))
4288                 return -EFAULT;
4289
4290         return 0;
4291 }
4292
4293
4294 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4295
4296 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4297                                   union bpf_attr __user *uattr)
4298 {
4299         int ufd = attr->info.bpf_fd;
4300         struct fd f;
4301         int err;
4302
4303         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4304                 return -EINVAL;
4305
4306         f = fdget(ufd);
4307         if (!f.file)
4308                 return -EBADFD;
4309
4310         if (f.file->f_op == &bpf_prog_fops)
4311                 err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4312                                               uattr);
4313         else if (f.file->f_op == &bpf_map_fops)
4314                 err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4315                                              uattr);
4316         else if (f.file->f_op == &btf_fops)
4317                 err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4318         else if (f.file->f_op == &bpf_link_fops)
4319                 err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4320                                               attr, uattr);
4321         else
4322                 err = -EINVAL;
4323
4324         fdput(f);
4325         return err;
4326 }
4327
4328 #define BPF_BTF_LOAD_LAST_FIELD btf_log_level
4329
4330 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
4331 {
4332         if (CHECK_ATTR(BPF_BTF_LOAD))
4333                 return -EINVAL;
4334
4335         if (!bpf_capable())
4336                 return -EPERM;
4337
4338         return btf_new_fd(attr, uattr);
4339 }
4340
4341 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4342
4343 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4344 {
4345         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4346                 return -EINVAL;
4347
4348         if (!capable(CAP_SYS_ADMIN))
4349                 return -EPERM;
4350
4351         return btf_get_fd_by_id(attr->btf_id);
4352 }
4353
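/* Copy the result of a task_fd_query back to user space: the name in @buf
 * (truncated with a terminating NUL, and -ENOSPC returned, if the user
 * buffer is too small) plus prog_id, fd_type, probe_offset and probe_addr.
 */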
4354 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
4355                                     union bpf_attr __user *uattr,
4356                                     u32 prog_id, u32 fd_type,
4357                                     const char *buf, u64 probe_offset,
4358                                     u64 probe_addr)
4359 {
4360         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
4361         u32 len = buf ? strlen(buf) : 0, input_len;
4362         int err = 0;
4363
4364         if (put_user(len, &uattr->task_fd_query.buf_len))
4365                 return -EFAULT;
4366         input_len = attr->task_fd_query.buf_len;
4367         if (input_len && ubuf) {
4368                 if (!len) {
4369                         /* nothing to copy, just make ubuf NULL terminated */
4370                         char zero = '\0';
4371
4372                         if (put_user(zero, ubuf))
4373                                 return -EFAULT;
4374                 } else if (input_len >= len + 1) {
4375                         /* ubuf can hold the string with NULL terminator */
4376                         if (copy_to_user(ubuf, buf, len + 1))
4377                                 return -EFAULT;
4378                 } else {
4379                         /* ubuf cannot hold the string with NULL terminator,
4380                          * do a partial copy with NULL terminator.
4381                          */
4382                         char zero = '\0';
4383
4384                         err = -ENOSPC;
4385                         if (copy_to_user(ubuf, buf, input_len - 1))
4386                                 return -EFAULT;
4387                         if (put_user(zero, ubuf + input_len - 1))
4388                                 return -EFAULT;
4389                 }
4390         }
4391
4392         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
4393             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
4394             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
4395             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
4396                 return -EFAULT;
4397
4398         return err;
4399 }
4400
4401 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
4402
4403 static int bpf_task_fd_query(const union bpf_attr *attr,
4404                              union bpf_attr __user *uattr)
4405 {
4406         pid_t pid = attr->task_fd_query.pid;
4407         u32 fd = attr->task_fd_query.fd;
4408         const struct perf_event *event;
4409         struct task_struct *task;
4410         struct file *file;
4411         int err;
4412
4413         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
4414                 return -EINVAL;
4415
4416         if (!capable(CAP_SYS_ADMIN))
4417                 return -EPERM;
4418
4419         if (attr->task_fd_query.flags != 0)
4420                 return -EINVAL;
4421
4422         rcu_read_lock();
4423         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
4424         rcu_read_unlock();
4425         if (!task)
4426                 return -ENOENT;
4427
4428         err = 0;
4429         file = fget_task(task, fd);
4430         put_task_struct(task);
4431         if (!file)
4432                 return -EBADF;
4433
4434         if (file->f_op == &bpf_link_fops) {
4435                 struct bpf_link *link = file->private_data;
4436
4437                 if (link->ops == &bpf_raw_tp_link_lops) {
4438                         struct bpf_raw_tp_link *raw_tp =
4439                                 container_of(link, struct bpf_raw_tp_link, link);
4440                         struct bpf_raw_event_map *btp = raw_tp->btp;
4441
4442                         err = bpf_task_fd_query_copy(attr, uattr,
4443                                                      raw_tp->link.prog->aux->id,
4444                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
4445                                                      btp->tp->name, 0, 0);
4446                         goto put_file;
4447                 }
4448                 goto out_not_supp;
4449         }
4450
4451         event = perf_get_event(file);
4452         if (!IS_ERR(event)) {
4453                 u64 probe_offset, probe_addr;
4454                 u32 prog_id, fd_type;
4455                 const char *buf;
4456
4457                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
4458                                               &buf, &probe_offset,
4459                                               &probe_addr);
4460                 if (!err)
4461                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
4462                                                      fd_type, buf,
4463                                                      probe_offset,
4464                                                      probe_addr);
4465                 goto put_file;
4466         }
4467
4468 out_not_supp:
4469         err = -ENOTSUPP;
4470 put_file:
4471         fput(file);
4472         return err;
4473 }
4474
4475 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
4476
4477 #define BPF_DO_BATCH(fn, ...)                   \
4478         do {                                    \
4479                 if (!fn) {                      \
4480                         err = -ENOTSUPP;        \
4481                         goto err_put;           \
4482                 }                               \
4483                 err = fn(__VA_ARGS__);          \
4484         } while (0)
4485
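/* Common entry point for the BPF_MAP_*_BATCH commands: check the
 * FMODE_CAN_READ/FMODE_CAN_WRITE permissions on the map fd, bump the
 * write-active counter for mutating commands and dispatch to the map's
 * batch callback, failing with -ENOTSUPP if the map type has none.
 */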
4486 static int bpf_map_do_batch(const union bpf_attr *attr,
4487                             union bpf_attr __user *uattr,
4488                             int cmd)
4489 {
4490         bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
4491                          cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
4492         bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
4493         struct bpf_map *map;
4494         int err, ufd;
4495         struct fd f;
4496
4497         if (CHECK_ATTR(BPF_MAP_BATCH))
4498                 return -EINVAL;
4499
4500         ufd = attr->batch.map_fd;
4501         f = fdget(ufd);
4502         map = __bpf_map_get(f);
4503         if (IS_ERR(map))
4504                 return PTR_ERR(map);
4505         if (has_write)
4506                 bpf_map_write_active_inc(map);
4507         if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
4508                 err = -EPERM;
4509                 goto err_put;
4510         }
4511         if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
4512                 err = -EPERM;
4513                 goto err_put;
4514         }
4515
4516         if (cmd == BPF_MAP_LOOKUP_BATCH)
4517                 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
4518         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
4519                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
4520         else if (cmd == BPF_MAP_UPDATE_BATCH)
4521                 BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
4522         else
4523                 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
4524 err_put:
4525         if (has_write)
4526                 bpf_map_write_active_dec(map);
4527         fdput(f);
4528         return err;
4529 }
4530
4531 #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
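/* Handler for BPF_LINK_CREATE: check that the requested attach type is
 * compatible with the program type, then dispatch to the per-type link
 * attach helper (cgroup, tracing, raw tracepoint, iterator, perf event
 * and so on).
 */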
4532 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
4533 {
4534         enum bpf_prog_type ptype;
4535         struct bpf_prog *prog;
4536         int ret;
4537
4538         if (CHECK_ATTR(BPF_LINK_CREATE))
4539                 return -EINVAL;
4540
4541         prog = bpf_prog_get(attr->link_create.prog_fd);
4542         if (IS_ERR(prog))
4543                 return PTR_ERR(prog);
4544
4545         ret = bpf_prog_attach_check_attach_type(prog,
4546                                                 attr->link_create.attach_type);
4547         if (ret)
4548                 goto out;
4549
4550         switch (prog->type) {
4551         case BPF_PROG_TYPE_EXT:
4552                 break;
4553         case BPF_PROG_TYPE_PERF_EVENT:
4554         case BPF_PROG_TYPE_TRACEPOINT:
4555                 if (attr->link_create.attach_type != BPF_PERF_EVENT) {
4556                         ret = -EINVAL;
4557                         goto out;
4558                 }
4559                 break;
4560         case BPF_PROG_TYPE_KPROBE:
4561                 if (attr->link_create.attach_type != BPF_PERF_EVENT &&
4562                     attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
4563                         ret = -EINVAL;
4564                         goto out;
4565                 }
4566                 break;
4567         default:
4568                 ptype = attach_type_to_prog_type(attr->link_create.attach_type);
4569                 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
4570                         ret = -EINVAL;
4571                         goto out;
4572                 }
4573                 break;
4574         }
4575
4576         switch (prog->type) {
4577         case BPF_PROG_TYPE_CGROUP_SKB:
4578         case BPF_PROG_TYPE_CGROUP_SOCK:
4579         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4580         case BPF_PROG_TYPE_SOCK_OPS:
4581         case BPF_PROG_TYPE_CGROUP_DEVICE:
4582         case BPF_PROG_TYPE_CGROUP_SYSCTL:
4583         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4584                 ret = cgroup_bpf_link_attach(attr, prog);
4585                 break;
4586         case BPF_PROG_TYPE_EXT:
4587                 ret = bpf_tracing_prog_attach(prog,
4588                                               attr->link_create.target_fd,
4589                                               attr->link_create.target_btf_id,
4590                                               attr->link_create.tracing.cookie);
4591                 break;
4592         case BPF_PROG_TYPE_LSM:
4593         case BPF_PROG_TYPE_TRACING:
4594                 if (attr->link_create.attach_type != prog->expected_attach_type) {
4595                         ret = -EINVAL;
4596                         goto out;
4597                 }
4598                 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
4599                         ret = bpf_raw_tp_link_attach(prog, NULL);
4600                 else if (prog->expected_attach_type == BPF_TRACE_ITER)
4601                         ret = bpf_iter_link_attach(attr, uattr, prog);
4602                 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
4603                         ret = cgroup_bpf_link_attach(attr, prog);
4604                 else
4605                         ret = bpf_tracing_prog_attach(prog,
4606                                                       attr->link_create.target_fd,
4607                                                       attr->link_create.target_btf_id,
4608                                                       attr->link_create.tracing.cookie);
4609                 break;
4610         case BPF_PROG_TYPE_FLOW_DISSECTOR:
4611         case BPF_PROG_TYPE_SK_LOOKUP:
4612                 ret = netns_bpf_link_create(attr, prog);
4613                 break;
4614 #ifdef CONFIG_NET
4615         case BPF_PROG_TYPE_XDP:
4616                 ret = bpf_xdp_link_attach(attr, prog);
4617                 break;
4618 #endif
4619         case BPF_PROG_TYPE_PERF_EVENT:
4620         case BPF_PROG_TYPE_TRACEPOINT:
4621                 ret = bpf_perf_link_attach(attr, prog);
4622                 break;
4623         case BPF_PROG_TYPE_KPROBE:
4624                 if (attr->link_create.attach_type == BPF_PERF_EVENT)
4625                         ret = bpf_perf_link_attach(attr, prog);
4626                 else
4627                         ret = bpf_kprobe_multi_link_attach(attr, prog);
4628                 break;
4629         default:
4630                 ret = -EINVAL;
4631         }
4632
4633 out:
4634         if (ret < 0)
4635                 bpf_prog_put(prog);
4636         return ret;
4637 }
4638
4639 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
4640
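/* Handler for BPF_LINK_UPDATE: replaces the program attached to a link
 * with the one given by new_prog_fd.  With BPF_F_REPLACE the program to
 * be replaced must additionally be identified by old_prog_fd.
 */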
4641 static int link_update(union bpf_attr *attr)
4642 {
4643         struct bpf_prog *old_prog = NULL, *new_prog;
4644         struct bpf_link *link;
4645         u32 flags;
4646         int ret;
4647
4648         if (CHECK_ATTR(BPF_LINK_UPDATE))
4649                 return -EINVAL;
4650
4651         flags = attr->link_update.flags;
4652         if (flags & ~BPF_F_REPLACE)
4653                 return -EINVAL;
4654
4655         link = bpf_link_get_from_fd(attr->link_update.link_fd);
4656         if (IS_ERR(link))
4657                 return PTR_ERR(link);
4658
4659         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
4660         if (IS_ERR(new_prog)) {
4661                 ret = PTR_ERR(new_prog);
4662                 goto out_put_link;
4663         }
4664
4665         if (flags & BPF_F_REPLACE) {
4666                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
4667                 if (IS_ERR(old_prog)) {
4668                         ret = PTR_ERR(old_prog);
4669                         old_prog = NULL;
4670                         goto out_put_progs;
4671                 }
4672         } else if (attr->link_update.old_prog_fd) {
4673                 ret = -EINVAL;
4674                 goto out_put_progs;
4675         }
4676
4677         if (link->ops->update_prog)
4678                 ret = link->ops->update_prog(link, new_prog, old_prog);
4679         else
4680                 ret = -EINVAL;
4681
4682 out_put_progs:
4683         if (old_prog)
4684                 bpf_prog_put(old_prog);
4685         if (ret)
4686                 bpf_prog_put(new_prog);
4687 out_put_link:
4688         bpf_link_put(link);
4689         return ret;
4690 }
4691
4692 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
4693
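/* Handler for BPF_LINK_DETACH: forcefully detaches a link from its target
 * (if the link type supports it) while leaving the link fd itself intact.
 */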
4694 static int link_detach(union bpf_attr *attr)
4695 {
4696         struct bpf_link *link;
4697         int ret;
4698
4699         if (CHECK_ATTR(BPF_LINK_DETACH))
4700                 return -EINVAL;
4701
4702         link = bpf_link_get_from_fd(attr->link_detach.link_fd);
4703         if (IS_ERR(link))
4704                 return PTR_ERR(link);
4705
4706         if (link->ops->detach)
4707                 ret = link->ops->detach(link);
4708         else
4709                 ret = -EOPNOTSUPP;
4710
4711         bpf_link_put(link);
4712         return ret;
4713 }
4714
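/* Take a reference on @link unless its refcount has already dropped to
 * zero, in which case -ENOENT is returned.
 */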
4715 static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
4716 {
4717         return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
4718 }
4719
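/* Look up a link by ID and take a reference on it.  Returns -ENOENT if no
 * such link exists and -EAGAIN if the link has been allocated but its ID
 * has not been published yet.
 */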
4720 struct bpf_link *bpf_link_by_id(u32 id)
4721 {
4722         struct bpf_link *link;
4723
4724         if (!id)
4725                 return ERR_PTR(-ENOENT);
4726
4727         spin_lock_bh(&link_idr_lock);
4728         /* before link is "settled", ID is 0, pretend it doesn't exist yet */
4729         link = idr_find(&link_idr, id);
4730         if (link) {
4731                 if (link->id)
4732                         link = bpf_link_inc_not_zero(link);
4733                 else
4734                         link = ERR_PTR(-EAGAIN);
4735         } else {
4736                 link = ERR_PTR(-ENOENT);
4737         }
4738         spin_unlock_bh(&link_idr_lock);
4739         return link;
4740 }
4741
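/* Return, with a reference held, the link with ID *id or, if that ID does
 * not exist, the link with the next higher ID.  Links that are already
 * being destroyed are skipped.
 */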
4742 struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
4743 {
4744         struct bpf_link *link;
4745
4746         spin_lock_bh(&link_idr_lock);
4747 again:
4748         link = idr_get_next(&link_idr, id);
4749         if (link) {
4750                 link = bpf_link_inc_not_zero(link);
4751                 if (IS_ERR(link)) {
4752                         (*id)++;
4753                         goto again;
4754                 }
4755         }
4756         spin_unlock_bh(&link_idr_lock);
4757
4758         return link;
4759 }
4760
4761 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
4762
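/* Handler for BPF_LINK_GET_FD_BY_ID: CAP_SYS_ADMIN only; installs a new
 * fd referring to the link with the given ID.
 */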
4763 static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
4764 {
4765         struct bpf_link *link;
4766         u32 id = attr->link_id;
4767         int fd;
4768
4769         if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
4770                 return -EINVAL;
4771
4772         if (!capable(CAP_SYS_ADMIN))
4773                 return -EPERM;
4774
4775         link = bpf_link_by_id(id);
4776         if (IS_ERR(link))
4777                 return PTR_ERR(link);
4778
4779         fd = bpf_link_new_fd(link);
4780         if (fd < 0)
4781                 bpf_link_put(link);
4782
4783         return fd;
4784 }
4785
4786 DEFINE_MUTEX(bpf_stats_enabled_mutex);
4787
4788 static int bpf_stats_release(struct inode *inode, struct file *file)
4789 {
4790         mutex_lock(&bpf_stats_enabled_mutex);
4791         static_key_slow_dec(&bpf_stats_enabled_key.key);
4792         mutex_unlock(&bpf_stats_enabled_mutex);
4793         return 0;
4794 }
4795
4796 static const struct file_operations bpf_stats_fops = {
4797         .release = bpf_stats_release,
4798 };
4799
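/* Enable run-time statistics collection by bumping bpf_stats_enabled_key
 * and return an anonymous-inode fd whose release (bpf_stats_release)
 * drops the key again.
 */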
4800 static int bpf_enable_runtime_stats(void)
4801 {
4802         int fd;
4803
4804         mutex_lock(&bpf_stats_enabled_mutex);
4805
4806         /* Set a very high limit to avoid overflow */
4807         if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
4808                 mutex_unlock(&bpf_stats_enabled_mutex);
4809                 return -EBUSY;
4810         }
4811
4812         fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
4813         if (fd >= 0)
4814                 static_key_slow_inc(&bpf_stats_enabled_key.key);
4815
4816         mutex_unlock(&bpf_stats_enabled_mutex);
4817         return fd;
4818 }
4819
4820 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
4821
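/* Handler for BPF_ENABLE_STATS: CAP_SYS_ADMIN only; BPF_STATS_RUN_TIME is
 * currently the only supported statistics type.
 */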
4822 static int bpf_enable_stats(union bpf_attr *attr)
4823 {
4825         if (CHECK_ATTR(BPF_ENABLE_STATS))
4826                 return -EINVAL;
4827
4828         if (!capable(CAP_SYS_ADMIN))
4829                 return -EPERM;
4830
4831         switch (attr->enable_stats.type) {
4832         case BPF_STATS_RUN_TIME:
4833                 return bpf_enable_runtime_stats();
4834         default:
4835                 break;
4836         }
4837         return -EINVAL;
4838 }
4839
4840 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
4841
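/* Handler for BPF_ITER_CREATE: creates a new iterator fd from the given
 * iterator link; no flags are currently supported.
 */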
4842 static int bpf_iter_create(union bpf_attr *attr)
4843 {
4844         struct bpf_link *link;
4845         int err;
4846
4847         if (CHECK_ATTR(BPF_ITER_CREATE))
4848                 return -EINVAL;
4849
4850         if (attr->iter_create.flags)
4851                 return -EINVAL;
4852
4853         link = bpf_link_get_from_fd(attr->iter_create.link_fd);
4854         if (IS_ERR(link))
4855                 return PTR_ERR(link);
4856
4857         err = bpf_iter_new_fd(link);
4858         bpf_link_put(link);
4859
4860         return err;
4861 }
4862
4863 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
4864
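/* Handler for BPF_PROG_BIND_MAP: records an additional map in the
 * program's used_maps[] array so that the map stays alive at least as
 * long as the program does.  Binding a map that is already used by the
 * program is a successful no-op.
 */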
4865 static int bpf_prog_bind_map(union bpf_attr *attr)
4866 {
4867         struct bpf_prog *prog;
4868         struct bpf_map *map;
4869         struct bpf_map **used_maps_old, **used_maps_new;
4870         int i, ret = 0;
4871
4872         if (CHECK_ATTR(BPF_PROG_BIND_MAP))
4873                 return -EINVAL;
4874
4875         if (attr->prog_bind_map.flags)
4876                 return -EINVAL;
4877
4878         prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
4879         if (IS_ERR(prog))
4880                 return PTR_ERR(prog);
4881
4882         map = bpf_map_get(attr->prog_bind_map.map_fd);
4883         if (IS_ERR(map)) {
4884                 ret = PTR_ERR(map);
4885                 goto out_prog_put;
4886         }
4887
4888         mutex_lock(&prog->aux->used_maps_mutex);
4889
4890         used_maps_old = prog->aux->used_maps;
4891
4892         for (i = 0; i < prog->aux->used_map_cnt; i++)
4893                 if (used_maps_old[i] == map) {
4894                         bpf_map_put(map);
4895                         goto out_unlock;
4896                 }
4897
4898         used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
4899                                       sizeof(used_maps_new[0]),
4900                                       GFP_KERNEL);
4901         if (!used_maps_new) {
4902                 ret = -ENOMEM;
4903                 goto out_unlock;
4904         }
4905
4906         memcpy(used_maps_new, used_maps_old,
4907                sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
4908         used_maps_new[prog->aux->used_map_cnt] = map;
4909
4910         prog->aux->used_map_cnt++;
4911         prog->aux->used_maps = used_maps_new;
4912
4913         kfree(used_maps_old);
4914
4915 out_unlock:
4916         mutex_unlock(&prog->aux->used_maps_mutex);
4917
4918         if (ret)
4919                 bpf_map_put(map);
4920 out_prog_put:
4921         bpf_prog_put(prog);
4922         return ret;
4923 }
4924
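/* Common implementation of the bpf(2) syscall, also reachable via the
 * bpf_sys_bpf() helper and kern_sys_bpf(): validates and copies the
 * attribute union from @uattr and dispatches on @cmd.
 */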
4925 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
4926 {
4927         union bpf_attr attr;
4928         bool capable;
4929         int err;
4930
4931         capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
4932
4933         /* Intent here is for unprivileged_bpf_disabled to block key object
4934          * creation commands for unprivileged users; other actions depend
4935          * on fd availability and access to bpffs, so are dependent on
4936          * object creation success.  Capabilities are later verified for
4937          * operations such as load and map create, so even with unprivileged
4938          * BPF disabled, capability checks are still carried out for these
4939          * and other operations.
4940          */
4941         if (!capable &&
4942             (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
4943                 return -EPERM;
4944
4945         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
4946         if (err)
4947                 return err;
4948         size = min_t(u32, size, sizeof(attr));
4949
4950         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
4951         memset(&attr, 0, sizeof(attr));
4952         if (copy_from_bpfptr(&attr, uattr, size) != 0)
4953                 return -EFAULT;
4954
4955         err = security_bpf(cmd, &attr, size);
4956         if (err < 0)
4957                 return err;
4958
4959         switch (cmd) {
4960         case BPF_MAP_CREATE:
4961                 err = map_create(&attr);
4962                 break;
4963         case BPF_MAP_LOOKUP_ELEM:
4964                 err = map_lookup_elem(&attr);
4965                 break;
4966         case BPF_MAP_UPDATE_ELEM:
4967                 err = map_update_elem(&attr, uattr);
4968                 break;
4969         case BPF_MAP_DELETE_ELEM:
4970                 err = map_delete_elem(&attr, uattr);
4971                 break;
4972         case BPF_MAP_GET_NEXT_KEY:
4973                 err = map_get_next_key(&attr);
4974                 break;
4975         case BPF_MAP_FREEZE:
4976                 err = map_freeze(&attr);
4977                 break;
4978         case BPF_PROG_LOAD:
4979                 err = bpf_prog_load(&attr, uattr);
4980                 break;
4981         case BPF_OBJ_PIN:
4982                 err = bpf_obj_pin(&attr);
4983                 break;
4984         case BPF_OBJ_GET:
4985                 err = bpf_obj_get(&attr);
4986                 break;
4987         case BPF_PROG_ATTACH:
4988                 err = bpf_prog_attach(&attr);
4989                 break;
4990         case BPF_PROG_DETACH:
4991                 err = bpf_prog_detach(&attr);
4992                 break;
4993         case BPF_PROG_QUERY:
4994                 err = bpf_prog_query(&attr, uattr.user);
4995                 break;
4996         case BPF_PROG_TEST_RUN:
4997                 err = bpf_prog_test_run(&attr, uattr.user);
4998                 break;
4999         case BPF_PROG_GET_NEXT_ID:
5000                 err = bpf_obj_get_next_id(&attr, uattr.user,
5001                                           &prog_idr, &prog_idr_lock);
5002                 break;
5003         case BPF_MAP_GET_NEXT_ID:
5004                 err = bpf_obj_get_next_id(&attr, uattr.user,
5005                                           &map_idr, &map_idr_lock);
5006                 break;
5007         case BPF_BTF_GET_NEXT_ID:
5008                 err = bpf_obj_get_next_id(&attr, uattr.user,
5009                                           &btf_idr, &btf_idr_lock);
5010                 break;
5011         case BPF_PROG_GET_FD_BY_ID:
5012                 err = bpf_prog_get_fd_by_id(&attr);
5013                 break;
5014         case BPF_MAP_GET_FD_BY_ID:
5015                 err = bpf_map_get_fd_by_id(&attr);
5016                 break;
5017         case BPF_OBJ_GET_INFO_BY_FD:
5018                 err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5019                 break;
5020         case BPF_RAW_TRACEPOINT_OPEN:
5021                 err = bpf_raw_tracepoint_open(&attr);
5022                 break;
5023         case BPF_BTF_LOAD:
5024                 err = bpf_btf_load(&attr, uattr);
5025                 break;
5026         case BPF_BTF_GET_FD_BY_ID:
5027                 err = bpf_btf_get_fd_by_id(&attr);
5028                 break;
5029         case BPF_TASK_FD_QUERY:
5030                 err = bpf_task_fd_query(&attr, uattr.user);
5031                 break;
5032         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5033                 err = map_lookup_and_delete_elem(&attr);
5034                 break;
5035         case BPF_MAP_LOOKUP_BATCH:
5036                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5037                 break;
5038         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5039                 err = bpf_map_do_batch(&attr, uattr.user,
5040                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5041                 break;
5042         case BPF_MAP_UPDATE_BATCH:
5043                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5044                 break;
5045         case BPF_MAP_DELETE_BATCH:
5046                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5047                 break;
5048         case BPF_LINK_CREATE:
5049                 err = link_create(&attr, uattr);
5050                 break;
5051         case BPF_LINK_UPDATE:
5052                 err = link_update(&attr);
5053                 break;
5054         case BPF_LINK_GET_FD_BY_ID:
5055                 err = bpf_link_get_fd_by_id(&attr);
5056                 break;
5057         case BPF_LINK_GET_NEXT_ID:
5058                 err = bpf_obj_get_next_id(&attr, uattr.user,
5059                                           &link_idr, &link_idr_lock);
5060                 break;
5061         case BPF_ENABLE_STATS:
5062                 err = bpf_enable_stats(&attr);
5063                 break;
5064         case BPF_ITER_CREATE:
5065                 err = bpf_iter_create(&attr);
5066                 break;
5067         case BPF_LINK_DETACH:
5068                 err = link_detach(&attr);
5069                 break;
5070         case BPF_PROG_BIND_MAP:
5071                 err = bpf_prog_bind_map(&attr);
5072                 break;
5073         default:
5074                 err = -EINVAL;
5075                 break;
5076         }
5077
5078         return err;
5079 }
5080
5081 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5082 {
5083         return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5084 }
5085
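/* BPF_PROG_TYPE_SYSCALL programs may access any context offset below
 * U16_MAX as long as the offset is a multiple of the access size.
 */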
5086 static bool syscall_prog_is_valid_access(int off, int size,
5087                                          enum bpf_access_type type,
5088                                          const struct bpf_prog *prog,
5089                                          struct bpf_insn_access_aux *info)
5090 {
5091         if (off < 0 || off >= U16_MAX)
5092                 return false;
5093         if (off % size != 0)
5094                 return false;
5095         return true;
5096 }
5097
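/* bpf_sys_bpf() helper for BPF_PROG_TYPE_SYSCALL programs: forwards an
 * allowlisted subset of bpf commands to __sys_bpf() with the attribute
 * union taken from kernel memory.
 */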
5098 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5099 {
5100         switch (cmd) {
5101         case BPF_MAP_CREATE:
5102         case BPF_MAP_DELETE_ELEM:
5103         case BPF_MAP_UPDATE_ELEM:
5104         case BPF_MAP_FREEZE:
5105         case BPF_MAP_GET_FD_BY_ID:
5106         case BPF_PROG_LOAD:
5107         case BPF_BTF_LOAD:
5108         case BPF_LINK_CREATE:
5109         case BPF_RAW_TRACEPOINT_OPEN:
5110                 break;
5111         default:
5112                 return -EINVAL;
5113         }
5114         return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5115 }
5116
5118 /* The prototype below exists only to silence -Wmissing-prototypes.
5119  * This function is used by the kernel light skeleton
5120  * to load bpf programs when modules are loaded or during kernel boot.
5121  * See tools/lib/bpf/skel_internal.h
5122  */
5123 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5124
5125 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5126 {
5127         struct bpf_prog * __maybe_unused prog;
5128         struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5129
5130         switch (cmd) {
5131 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5132         case BPF_PROG_TEST_RUN:
5133                 if (attr->test.data_in || attr->test.data_out ||
5134                     attr->test.ctx_out || attr->test.duration ||
5135                     attr->test.repeat || attr->test.flags)
5136                         return -EINVAL;
5137
5138                 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5139                 if (IS_ERR(prog))
5140                         return PTR_ERR(prog);
5141
5142                 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5143                     attr->test.ctx_size_in > U16_MAX) {
5144                         bpf_prog_put(prog);
5145                         return -EINVAL;
5146                 }
5147
5148                 run_ctx.bpf_cookie = 0;
5149                 run_ctx.saved_run_ctx = NULL;
5150                 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5151                         /* recursion detected */
5152                         bpf_prog_put(prog);
5153                         return -EBUSY;
5154                 }
5155                 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5156                 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5157                                                 &run_ctx);
5158                 bpf_prog_put(prog);
5159                 return 0;
5160 #endif
5161         default:
5162                 return ____bpf_sys_bpf(cmd, attr, size);
5163         }
5164 }
5165 EXPORT_SYMBOL(kern_sys_bpf);
5166
5167 static const struct bpf_func_proto bpf_sys_bpf_proto = {
5168         .func           = bpf_sys_bpf,
5169         .gpl_only       = false,
5170         .ret_type       = RET_INTEGER,
5171         .arg1_type      = ARG_ANYTHING,
5172         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
5173         .arg3_type      = ARG_CONST_SIZE,
5174 };
5175
5176 const struct bpf_func_proto * __weak
5177 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5178 {
5179         return bpf_base_func_proto(func_id);
5180 }
5181
5182 BPF_CALL_1(bpf_sys_close, u32, fd)
5183 {
5184         /* When a bpf program calls this helper there must not be
5185          * an outstanding fdget() without a matching, completed fdput().
5186          * This helper is therefore only allowed in the following callchain:
5187          * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
5188          */
5189         return close_fd(fd);
5190 }
5191
5192 static const struct bpf_func_proto bpf_sys_close_proto = {
5193         .func           = bpf_sys_close,
5194         .gpl_only       = false,
5195         .ret_type       = RET_INTEGER,
5196         .arg1_type      = ARG_ANYTHING,
5197 };
5198
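/* bpf_kallsyms_lookup_name() helper: resolves a NUL-terminated kernel
 * symbol name to its address in *res.  No flags are supported and the
 * caller must pass the bpf_dump_raw_ok(current_cred()) check.
 */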
5199 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
5200 {
5201         if (flags)
5202                 return -EINVAL;
5203
5204         if (name_sz <= 1 || name[name_sz - 1])
5205                 return -EINVAL;
5206
5207         if (!bpf_dump_raw_ok(current_cred()))
5208                 return -EPERM;
5209
5210         *res = kallsyms_lookup_name(name);
5211         return *res ? 0 : -ENOENT;
5212 }
5213
5214 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5215         .func           = bpf_kallsyms_lookup_name,
5216         .gpl_only       = false,
5217         .ret_type       = RET_INTEGER,
5218         .arg1_type      = ARG_PTR_TO_MEM,
5219         .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
5220         .arg3_type      = ARG_ANYTHING,
5221         .arg4_type      = ARG_PTR_TO_LONG,
5222 };
5223
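/* Helpers available to BPF_PROG_TYPE_SYSCALL programs.  bpf_sys_bpf()
 * itself additionally requires perfmon_capable(); anything not handled
 * here falls back to the tracing helper set.
 */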
5224 static const struct bpf_func_proto *
5225 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5226 {
5227         switch (func_id) {
5228         case BPF_FUNC_sys_bpf:
5229                 return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
5230         case BPF_FUNC_btf_find_by_name_kind:
5231                 return &bpf_btf_find_by_name_kind_proto;
5232         case BPF_FUNC_sys_close:
5233                 return &bpf_sys_close_proto;
5234         case BPF_FUNC_kallsyms_lookup_name:
5235                 return &bpf_kallsyms_lookup_name_proto;
5236         default:
5237                 return tracing_prog_func_proto(func_id, prog);
5238         }
5239 }
5240
5241 const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5242         .get_func_proto  = syscall_prog_func_proto,
5243         .is_valid_access = syscall_prog_is_valid_access,
5244 };
5245
5246 const struct bpf_prog_ops bpf_syscall_prog_ops = {
5247         .test_run = bpf_prog_test_run_syscall,
5248 };
5249
5250 #ifdef CONFIG_SYSCTL
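/* Sysctl handler for kernel.bpf_stats_enabled: flips the bpf_stats_enabled
 * static key when the written value changes, remembering the last written
 * value in saved_val and serializing against BPF_ENABLE_STATS via
 * bpf_stats_enabled_mutex.
 */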
5251 static int bpf_stats_handler(struct ctl_table *table, int write,
5252                              void *buffer, size_t *lenp, loff_t *ppos)
5253 {
5254         struct static_key *key = (struct static_key *)table->data;
5255         static int saved_val;
5256         int val, ret;
5257         struct ctl_table tmp = {
5258                 .data   = &val,
5259                 .maxlen = sizeof(val),
5260                 .mode   = table->mode,
5261                 .extra1 = SYSCTL_ZERO,
5262                 .extra2 = SYSCTL_ONE,
5263         };
5264
5265         if (write && !capable(CAP_SYS_ADMIN))
5266                 return -EPERM;
5267
5268         mutex_lock(&bpf_stats_enabled_mutex);
5269         val = saved_val;
5270         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5271         if (write && !ret && val != saved_val) {
5272                 if (val)
5273                         static_key_slow_inc(key);
5274                 else
5275                         static_key_slow_dec(key);
5276                 saved_val = val;
5277         }
5278         mutex_unlock(&bpf_stats_enabled_mutex);
5279         return ret;
5280 }
5281
5282 void __weak unpriv_ebpf_notify(int new_state)
5283 {
5284 }
5285
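/* Sysctl handler for kernel.unprivileged_bpf_disabled: writes require
 * CAP_SYS_ADMIN, and once the value has been set to 1 it can no longer
 * be changed.
 */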
5286 static int bpf_unpriv_handler(struct ctl_table *table, int write,
5287                               void *buffer, size_t *lenp, loff_t *ppos)
5288 {
5289         int ret, unpriv_enable = *(int *)table->data;
5290         bool locked_state = unpriv_enable == 1;
5291         struct ctl_table tmp = *table;
5292
5293         if (write && !capable(CAP_SYS_ADMIN))
5294                 return -EPERM;
5295
5296         tmp.data = &unpriv_enable;
5297         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5298         if (write && !ret) {
5299                 if (locked_state && unpriv_enable != 1)
5300                         return -EPERM;
5301                 *(int *)table->data = unpriv_enable;
5302         }
5303
5304         unpriv_ebpf_notify(unpriv_enable);
5305
5306         return ret;
5307 }
5308
5309 static struct ctl_table bpf_syscall_table[] = {
5310         {
5311                 .procname       = "unprivileged_bpf_disabled",
5312                 .data           = &sysctl_unprivileged_bpf_disabled,
5313                 .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
5314                 .mode           = 0644,
5315                 .proc_handler   = bpf_unpriv_handler,
5316                 .extra1         = SYSCTL_ZERO,
5317                 .extra2         = SYSCTL_TWO,
5318         },
5319         {
5320                 .procname       = "bpf_stats_enabled",
5321                 .data           = &bpf_stats_enabled_key.key,
5322                 .mode           = 0644,
5323                 .proc_handler   = bpf_stats_handler,
5324         },
5325         { }
5326 };
5327
5328 static int __init bpf_syscall_sysctl_init(void)
5329 {
5330         register_sysctl_init("kernel", bpf_syscall_table);
5331         return 0;
5332 }
5333 late_initcall(bpf_syscall_sysctl_init);
5334 #endif /* CONFIG_SYSCTL */