perf env: Avoid recursively taking env->bpf_progs.lock
[platform/kernel/linux-starfive.git] / kernel / bpf / task_iter.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2020 Facebook */
3
4 #include <linux/init.h>
5 #include <linux/namei.h>
6 #include <linux/pid_namespace.h>
7 #include <linux/fs.h>
8 #include <linux/fdtable.h>
9 #include <linux/filter.h>
10 #include <linux/btf_ids.h>
11 #include "mmap_unlock_work.h"
12
13 static const char * const iter_task_type_names[] = {
14         "ALL",
15         "TID",
16         "PID",
17 };
18
19 struct bpf_iter_seq_task_common {
20         struct pid_namespace *ns;
21         enum bpf_iter_task_type type;
22         u32 pid;
23         u32 pid_visiting;
24 };
25
26 struct bpf_iter_seq_task_info {
27         /* The first field must be struct bpf_iter_seq_task_common.
28          * this is assumed by {init, fini}_seq_pidns() callback functions.
29          */
30         struct bpf_iter_seq_task_common common;
31         u32 tid;
32 };
33
34 static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
35                                                    u32 *tid,
36                                                    bool skip_if_dup_files)
37 {
38         struct task_struct *task, *next_task;
39         struct pid *pid;
40         u32 saved_tid;
41
42         if (!*tid) {
43                 /* The first time, the iterator calls this function. */
44                 pid = find_pid_ns(common->pid, common->ns);
45                 if (!pid)
46                         return NULL;
47
48                 task = get_pid_task(pid, PIDTYPE_TGID);
49                 if (!task)
50                         return NULL;
51
52                 *tid = common->pid;
53                 common->pid_visiting = common->pid;
54
55                 return task;
56         }
57
58         /* If the control returns to user space and comes back to the
59          * kernel again, *tid and common->pid_visiting should be the
60          * same for task_seq_start() to pick up the correct task.
61          */
62         if (*tid == common->pid_visiting) {
63                 pid = find_pid_ns(common->pid_visiting, common->ns);
64                 task = get_pid_task(pid, PIDTYPE_PID);
65
66                 return task;
67         }
68
69         pid = find_pid_ns(common->pid_visiting, common->ns);
70         if (!pid)
71                 return NULL;
72
73         task = get_pid_task(pid, PIDTYPE_PID);
74         if (!task)
75                 return NULL;
76
77 retry:
78         if (!pid_alive(task)) {
79                 put_task_struct(task);
80                 return NULL;
81         }
82
83         next_task = next_thread(task);
84         put_task_struct(task);
85         if (!next_task)
86                 return NULL;
87
88         saved_tid = *tid;
89         *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
90         if (!*tid || *tid == common->pid) {
91                 /* Run out of tasks of a process.  The tasks of a
92                  * thread_group are linked as circular linked list.
93                  */
94                 *tid = saved_tid;
95                 return NULL;
96         }
97
98         get_task_struct(next_task);
99         common->pid_visiting = *tid;
100
101         if (skip_if_dup_files && task->files == task->group_leader->files) {
102                 task = next_task;
103                 goto retry;
104         }
105
106         return next_task;
107 }
108
109 static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
110                                              u32 *tid,
111                                              bool skip_if_dup_files)
112 {
113         struct task_struct *task = NULL;
114         struct pid *pid;
115
116         if (common->type == BPF_TASK_ITER_TID) {
117                 if (*tid && *tid != common->pid)
118                         return NULL;
119                 rcu_read_lock();
120                 pid = find_pid_ns(common->pid, common->ns);
121                 if (pid) {
122                         task = get_pid_task(pid, PIDTYPE_TGID);
123                         *tid = common->pid;
124                 }
125                 rcu_read_unlock();
126
127                 return task;
128         }
129
130         if (common->type == BPF_TASK_ITER_TGID) {
131                 rcu_read_lock();
132                 task = task_group_seq_get_next(common, tid, skip_if_dup_files);
133                 rcu_read_unlock();
134
135                 return task;
136         }
137
138         rcu_read_lock();
139 retry:
140         pid = find_ge_pid(*tid, common->ns);
141         if (pid) {
142                 *tid = pid_nr_ns(pid, common->ns);
143                 task = get_pid_task(pid, PIDTYPE_PID);
144                 if (!task) {
145                         ++*tid;
146                         goto retry;
147                 } else if (skip_if_dup_files && !thread_group_leader(task) &&
148                            task->files == task->group_leader->files) {
149                         put_task_struct(task);
150                         task = NULL;
151                         ++*tid;
152                         goto retry;
153                 }
154         }
155         rcu_read_unlock();
156
157         return task;
158 }
159
160 static void *task_seq_start(struct seq_file *seq, loff_t *pos)
161 {
162         struct bpf_iter_seq_task_info *info = seq->private;
163         struct task_struct *task;
164
165         task = task_seq_get_next(&info->common, &info->tid, false);
166         if (!task)
167                 return NULL;
168
169         if (*pos == 0)
170                 ++*pos;
171         return task;
172 }
173
174 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
175 {
176         struct bpf_iter_seq_task_info *info = seq->private;
177         struct task_struct *task;
178
179         ++*pos;
180         ++info->tid;
181         put_task_struct((struct task_struct *)v);
182         task = task_seq_get_next(&info->common, &info->tid, false);
183         if (!task)
184                 return NULL;
185
186         return task;
187 }
188
189 struct bpf_iter__task {
190         __bpf_md_ptr(struct bpf_iter_meta *, meta);
191         __bpf_md_ptr(struct task_struct *, task);
192 };
193
194 DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
195
196 static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
197                            bool in_stop)
198 {
199         struct bpf_iter_meta meta;
200         struct bpf_iter__task ctx;
201         struct bpf_prog *prog;
202
203         meta.seq = seq;
204         prog = bpf_iter_get_info(&meta, in_stop);
205         if (!prog)
206                 return 0;
207
208         ctx.meta = &meta;
209         ctx.task = task;
210         return bpf_iter_run_prog(prog, &ctx);
211 }
212
213 static int task_seq_show(struct seq_file *seq, void *v)
214 {
215         return __task_seq_show(seq, v, false);
216 }
217
218 static void task_seq_stop(struct seq_file *seq, void *v)
219 {
220         if (!v)
221                 (void)__task_seq_show(seq, v, true);
222         else
223                 put_task_struct((struct task_struct *)v);
224 }
225
226 static int bpf_iter_attach_task(struct bpf_prog *prog,
227                                 union bpf_iter_link_info *linfo,
228                                 struct bpf_iter_aux_info *aux)
229 {
230         unsigned int flags;
231         struct pid *pid;
232         pid_t tgid;
233
234         if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
235                 return -EINVAL;
236
237         aux->task.type = BPF_TASK_ITER_ALL;
238         if (linfo->task.tid != 0) {
239                 aux->task.type = BPF_TASK_ITER_TID;
240                 aux->task.pid = linfo->task.tid;
241         }
242         if (linfo->task.pid != 0) {
243                 aux->task.type = BPF_TASK_ITER_TGID;
244                 aux->task.pid = linfo->task.pid;
245         }
246         if (linfo->task.pid_fd != 0) {
247                 aux->task.type = BPF_TASK_ITER_TGID;
248
249                 pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
250                 if (IS_ERR(pid))
251                         return PTR_ERR(pid);
252
253                 tgid = pid_nr_ns(pid, task_active_pid_ns(current));
254                 aux->task.pid = tgid;
255                 put_pid(pid);
256         }
257
258         return 0;
259 }
260
261 static const struct seq_operations task_seq_ops = {
262         .start  = task_seq_start,
263         .next   = task_seq_next,
264         .stop   = task_seq_stop,
265         .show   = task_seq_show,
266 };
267
268 struct bpf_iter_seq_task_file_info {
269         /* The first field must be struct bpf_iter_seq_task_common.
270          * this is assumed by {init, fini}_seq_pidns() callback functions.
271          */
272         struct bpf_iter_seq_task_common common;
273         struct task_struct *task;
274         u32 tid;
275         u32 fd;
276 };
277
278 static struct file *
279 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
280 {
281         u32 saved_tid = info->tid;
282         struct task_struct *curr_task;
283         unsigned int curr_fd = info->fd;
284
285         /* If this function returns a non-NULL file object,
286          * it held a reference to the task/file.
287          * Otherwise, it does not hold any reference.
288          */
289 again:
290         if (info->task) {
291                 curr_task = info->task;
292                 curr_fd = info->fd;
293         } else {
294                 curr_task = task_seq_get_next(&info->common, &info->tid, true);
295                 if (!curr_task) {
296                         info->task = NULL;
297                         return NULL;
298                 }
299
300                 /* set info->task */
301                 info->task = curr_task;
302                 if (saved_tid == info->tid)
303                         curr_fd = info->fd;
304                 else
305                         curr_fd = 0;
306         }
307
308         rcu_read_lock();
309         for (;; curr_fd++) {
310                 struct file *f;
311                 f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
312                 if (!f)
313                         break;
314                 if (!get_file_rcu(f))
315                         continue;
316
317                 /* set info->fd */
318                 info->fd = curr_fd;
319                 rcu_read_unlock();
320                 return f;
321         }
322
323         /* the current task is done, go to the next task */
324         rcu_read_unlock();
325         put_task_struct(curr_task);
326
327         if (info->common.type == BPF_TASK_ITER_TID) {
328                 info->task = NULL;
329                 return NULL;
330         }
331
332         info->task = NULL;
333         info->fd = 0;
334         saved_tid = ++(info->tid);
335         goto again;
336 }
337
338 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
339 {
340         struct bpf_iter_seq_task_file_info *info = seq->private;
341         struct file *file;
342
343         info->task = NULL;
344         file = task_file_seq_get_next(info);
345         if (file && *pos == 0)
346                 ++*pos;
347
348         return file;
349 }
350
351 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
352 {
353         struct bpf_iter_seq_task_file_info *info = seq->private;
354
355         ++*pos;
356         ++info->fd;
357         fput((struct file *)v);
358         return task_file_seq_get_next(info);
359 }
360
361 struct bpf_iter__task_file {
362         __bpf_md_ptr(struct bpf_iter_meta *, meta);
363         __bpf_md_ptr(struct task_struct *, task);
364         u32 fd __aligned(8);
365         __bpf_md_ptr(struct file *, file);
366 };
367
368 DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
369                      struct task_struct *task, u32 fd,
370                      struct file *file)
371
372 static int __task_file_seq_show(struct seq_file *seq, struct file *file,
373                                 bool in_stop)
374 {
375         struct bpf_iter_seq_task_file_info *info = seq->private;
376         struct bpf_iter__task_file ctx;
377         struct bpf_iter_meta meta;
378         struct bpf_prog *prog;
379
380         meta.seq = seq;
381         prog = bpf_iter_get_info(&meta, in_stop);
382         if (!prog)
383                 return 0;
384
385         ctx.meta = &meta;
386         ctx.task = info->task;
387         ctx.fd = info->fd;
388         ctx.file = file;
389         return bpf_iter_run_prog(prog, &ctx);
390 }
391
392 static int task_file_seq_show(struct seq_file *seq, void *v)
393 {
394         return __task_file_seq_show(seq, v, false);
395 }
396
397 static void task_file_seq_stop(struct seq_file *seq, void *v)
398 {
399         struct bpf_iter_seq_task_file_info *info = seq->private;
400
401         if (!v) {
402                 (void)__task_file_seq_show(seq, v, true);
403         } else {
404                 fput((struct file *)v);
405                 put_task_struct(info->task);
406                 info->task = NULL;
407         }
408 }
409
410 static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
411 {
412         struct bpf_iter_seq_task_common *common = priv_data;
413
414         common->ns = get_pid_ns(task_active_pid_ns(current));
415         common->type = aux->task.type;
416         common->pid = aux->task.pid;
417
418         return 0;
419 }
420
421 static void fini_seq_pidns(void *priv_data)
422 {
423         struct bpf_iter_seq_task_common *common = priv_data;
424
425         put_pid_ns(common->ns);
426 }
427
428 static const struct seq_operations task_file_seq_ops = {
429         .start  = task_file_seq_start,
430         .next   = task_file_seq_next,
431         .stop   = task_file_seq_stop,
432         .show   = task_file_seq_show,
433 };
434
435 struct bpf_iter_seq_task_vma_info {
436         /* The first field must be struct bpf_iter_seq_task_common.
437          * this is assumed by {init, fini}_seq_pidns() callback functions.
438          */
439         struct bpf_iter_seq_task_common common;
440         struct task_struct *task;
441         struct mm_struct *mm;
442         struct vm_area_struct *vma;
443         u32 tid;
444         unsigned long prev_vm_start;
445         unsigned long prev_vm_end;
446 };
447
/* How task_vma_seq_get_next() locates the vma to return. */
enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* find_vma() at address 0 */
	task_vma_iter_next_vma,    /* find_vma() at the end of the current vma */
	task_vma_iter_find_vma,    /* find_vma() from the saved prev_vm_end */
};
453
454 static struct vm_area_struct *
455 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
456 {
457         enum bpf_task_vma_iter_find_op op;
458         struct vm_area_struct *curr_vma;
459         struct task_struct *curr_task;
460         struct mm_struct *curr_mm;
461         u32 saved_tid = info->tid;
462
463         /* If this function returns a non-NULL vma, it holds a reference to
464          * the task_struct, holds a refcount on mm->mm_users, and holds
465          * read lock on vma->mm->mmap_lock.
466          * If this function returns NULL, it does not hold any reference or
467          * lock.
468          */
469         if (info->task) {
470                 curr_task = info->task;
471                 curr_vma = info->vma;
472                 curr_mm = info->mm;
473                 /* In case of lock contention, drop mmap_lock to unblock
474                  * the writer.
475                  *
476                  * After relock, call find(mm, prev_vm_end - 1) to find
477                  * new vma to process.
478                  *
479                  *   +------+------+-----------+
480                  *   | VMA1 | VMA2 | VMA3      |
481                  *   +------+------+-----------+
482                  *   |      |      |           |
483                  *  4k     8k     16k         400k
484                  *
485                  * For example, curr_vma == VMA2. Before unlock, we set
486                  *
487                  *    prev_vm_start = 8k
488                  *    prev_vm_end   = 16k
489                  *
490                  * There are a few cases:
491                  *
492                  * 1) VMA2 is freed, but VMA3 exists.
493                  *
494                  *    find_vma() will return VMA3, just process VMA3.
495                  *
496                  * 2) VMA2 still exists.
497                  *
498                  *    find_vma() will return VMA2, process VMA2->next.
499                  *
500                  * 3) no more vma in this mm.
501                  *
502                  *    Process the next task.
503                  *
504                  * 4) find_vma() returns a different vma, VMA2'.
505                  *
506                  *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
507                  *         because we already covered the range;
508                  *    4.2) VMA2 and VMA2' covers different ranges, process
509                  *         VMA2'.
510                  */
511                 if (mmap_lock_is_contended(curr_mm)) {
512                         info->prev_vm_start = curr_vma->vm_start;
513                         info->prev_vm_end = curr_vma->vm_end;
514                         op = task_vma_iter_find_vma;
515                         mmap_read_unlock(curr_mm);
516                         if (mmap_read_lock_killable(curr_mm)) {
517                                 mmput(curr_mm);
518                                 goto finish;
519                         }
520                 } else {
521                         op = task_vma_iter_next_vma;
522                 }
523         } else {
524 again:
525                 curr_task = task_seq_get_next(&info->common, &info->tid, true);
526                 if (!curr_task) {
527                         info->tid++;
528                         goto finish;
529                 }
530
531                 if (saved_tid != info->tid) {
532                         /* new task, process the first vma */
533                         op = task_vma_iter_first_vma;
534                 } else {
535                         /* Found the same tid, which means the user space
536                          * finished data in previous buffer and read more.
537                          * We dropped mmap_lock before returning to user
538                          * space, so it is necessary to use find_vma() to
539                          * find the next vma to process.
540                          */
541                         op = task_vma_iter_find_vma;
542                 }
543
544                 curr_mm = get_task_mm(curr_task);
545                 if (!curr_mm)
546                         goto next_task;
547
548                 if (mmap_read_lock_killable(curr_mm)) {
549                         mmput(curr_mm);
550                         goto finish;
551                 }
552         }
553
554         switch (op) {
555         case task_vma_iter_first_vma:
556                 curr_vma = find_vma(curr_mm, 0);
557                 break;
558         case task_vma_iter_next_vma:
559                 curr_vma = find_vma(curr_mm, curr_vma->vm_end);
560                 break;
561         case task_vma_iter_find_vma:
562                 /* We dropped mmap_lock so it is necessary to use find_vma
563                  * to find the next vma. This is similar to the  mechanism
564                  * in show_smaps_rollup().
565                  */
566                 curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
567                 /* case 1) and 4.2) above just use curr_vma */
568
569                 /* check for case 2) or case 4.1) above */
570                 if (curr_vma &&
571                     curr_vma->vm_start == info->prev_vm_start &&
572                     curr_vma->vm_end == info->prev_vm_end)
573                         curr_vma = find_vma(curr_mm, curr_vma->vm_end);
574                 break;
575         }
576         if (!curr_vma) {
577                 /* case 3) above, or case 2) 4.1) with vma->next == NULL */
578                 mmap_read_unlock(curr_mm);
579                 mmput(curr_mm);
580                 goto next_task;
581         }
582         info->task = curr_task;
583         info->vma = curr_vma;
584         info->mm = curr_mm;
585         return curr_vma;
586
587 next_task:
588         if (info->common.type == BPF_TASK_ITER_TID)
589                 goto finish;
590
591         put_task_struct(curr_task);
592         info->task = NULL;
593         info->mm = NULL;
594         info->tid++;
595         goto again;
596
597 finish:
598         if (curr_task)
599                 put_task_struct(curr_task);
600         info->task = NULL;
601         info->vma = NULL;
602         info->mm = NULL;
603         return NULL;
604 }
605
606 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
607 {
608         struct bpf_iter_seq_task_vma_info *info = seq->private;
609         struct vm_area_struct *vma;
610
611         vma = task_vma_seq_get_next(info);
612         if (vma && *pos == 0)
613                 ++*pos;
614
615         return vma;
616 }
617
618 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
619 {
620         struct bpf_iter_seq_task_vma_info *info = seq->private;
621
622         ++*pos;
623         return task_vma_seq_get_next(info);
624 }
625
626 struct bpf_iter__task_vma {
627         __bpf_md_ptr(struct bpf_iter_meta *, meta);
628         __bpf_md_ptr(struct task_struct *, task);
629         __bpf_md_ptr(struct vm_area_struct *, vma);
630 };
631
632 DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
633                      struct task_struct *task, struct vm_area_struct *vma)
634
635 static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
636 {
637         struct bpf_iter_seq_task_vma_info *info = seq->private;
638         struct bpf_iter__task_vma ctx;
639         struct bpf_iter_meta meta;
640         struct bpf_prog *prog;
641
642         meta.seq = seq;
643         prog = bpf_iter_get_info(&meta, in_stop);
644         if (!prog)
645                 return 0;
646
647         ctx.meta = &meta;
648         ctx.task = info->task;
649         ctx.vma = info->vma;
650         return bpf_iter_run_prog(prog, &ctx);
651 }
652
653 static int task_vma_seq_show(struct seq_file *seq, void *v)
654 {
655         return __task_vma_seq_show(seq, false);
656 }
657
658 static void task_vma_seq_stop(struct seq_file *seq, void *v)
659 {
660         struct bpf_iter_seq_task_vma_info *info = seq->private;
661
662         if (!v) {
663                 (void)__task_vma_seq_show(seq, true);
664         } else {
665                 /* info->vma has not been seen by the BPF program. If the
666                  * user space reads more, task_vma_seq_get_next should
667                  * return this vma again. Set prev_vm_start to ~0UL,
668                  * so that we don't skip the vma returned by the next
669                  * find_vma() (case task_vma_iter_find_vma in
670                  * task_vma_seq_get_next()).
671                  */
672                 info->prev_vm_start = ~0UL;
673                 info->prev_vm_end = info->vma->vm_end;
674                 mmap_read_unlock(info->mm);
675                 mmput(info->mm);
676                 info->mm = NULL;
677                 put_task_struct(info->task);
678                 info->task = NULL;
679         }
680 }
681
682 static const struct seq_operations task_vma_seq_ops = {
683         .start  = task_vma_seq_start,
684         .next   = task_vma_seq_next,
685         .stop   = task_vma_seq_stop,
686         .show   = task_vma_seq_show,
687 };
688
689 static const struct bpf_iter_seq_info task_seq_info = {
690         .seq_ops                = &task_seq_ops,
691         .init_seq_private       = init_seq_pidns,
692         .fini_seq_private       = fini_seq_pidns,
693         .seq_priv_size          = sizeof(struct bpf_iter_seq_task_info),
694 };
695
696 static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
697 {
698         switch (aux->task.type) {
699         case BPF_TASK_ITER_TID:
700                 info->iter.task.tid = aux->task.pid;
701                 break;
702         case BPF_TASK_ITER_TGID:
703                 info->iter.task.pid = aux->task.pid;
704                 break;
705         default:
706                 break;
707         }
708         return 0;
709 }
710
711 static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
712 {
713         seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
714         if (aux->task.type == BPF_TASK_ITER_TID)
715                 seq_printf(seq, "tid:\t%u\n", aux->task.pid);
716         else if (aux->task.type == BPF_TASK_ITER_TGID)
717                 seq_printf(seq, "pid:\t%u\n", aux->task.pid);
718 }
719
720 static struct bpf_iter_reg task_reg_info = {
721         .target                 = "task",
722         .attach_target          = bpf_iter_attach_task,
723         .feature                = BPF_ITER_RESCHED,
724         .ctx_arg_info_size      = 1,
725         .ctx_arg_info           = {
726                 { offsetof(struct bpf_iter__task, task),
727                   PTR_TO_BTF_ID_OR_NULL },
728         },
729         .seq_info               = &task_seq_info,
730         .fill_link_info         = bpf_iter_fill_link_info,
731         .show_fdinfo            = bpf_iter_task_show_fdinfo,
732 };
733
734 static const struct bpf_iter_seq_info task_file_seq_info = {
735         .seq_ops                = &task_file_seq_ops,
736         .init_seq_private       = init_seq_pidns,
737         .fini_seq_private       = fini_seq_pidns,
738         .seq_priv_size          = sizeof(struct bpf_iter_seq_task_file_info),
739 };
740
741 static struct bpf_iter_reg task_file_reg_info = {
742         .target                 = "task_file",
743         .attach_target          = bpf_iter_attach_task,
744         .feature                = BPF_ITER_RESCHED,
745         .ctx_arg_info_size      = 2,
746         .ctx_arg_info           = {
747                 { offsetof(struct bpf_iter__task_file, task),
748                   PTR_TO_BTF_ID_OR_NULL },
749                 { offsetof(struct bpf_iter__task_file, file),
750                   PTR_TO_BTF_ID_OR_NULL },
751         },
752         .seq_info               = &task_file_seq_info,
753         .fill_link_info         = bpf_iter_fill_link_info,
754         .show_fdinfo            = bpf_iter_task_show_fdinfo,
755 };
756
757 static const struct bpf_iter_seq_info task_vma_seq_info = {
758         .seq_ops                = &task_vma_seq_ops,
759         .init_seq_private       = init_seq_pidns,
760         .fini_seq_private       = fini_seq_pidns,
761         .seq_priv_size          = sizeof(struct bpf_iter_seq_task_vma_info),
762 };
763
764 static struct bpf_iter_reg task_vma_reg_info = {
765         .target                 = "task_vma",
766         .attach_target          = bpf_iter_attach_task,
767         .feature                = BPF_ITER_RESCHED,
768         .ctx_arg_info_size      = 2,
769         .ctx_arg_info           = {
770                 { offsetof(struct bpf_iter__task_vma, task),
771                   PTR_TO_BTF_ID_OR_NULL },
772                 { offsetof(struct bpf_iter__task_vma, vma),
773                   PTR_TO_BTF_ID_OR_NULL },
774         },
775         .seq_info               = &task_vma_seq_info,
776         .fill_link_info         = bpf_iter_fill_link_info,
777         .show_fdinfo            = bpf_iter_task_show_fdinfo,
778 };
779
780 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
781            bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
782 {
783         struct mmap_unlock_irq_work *work = NULL;
784         struct vm_area_struct *vma;
785         bool irq_work_busy = false;
786         struct mm_struct *mm;
787         int ret = -ENOENT;
788
789         if (flags)
790                 return -EINVAL;
791
792         if (!task)
793                 return -ENOENT;
794
795         mm = task->mm;
796         if (!mm)
797                 return -ENOENT;
798
799         irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
800
801         if (irq_work_busy || !mmap_read_trylock(mm))
802                 return -EBUSY;
803
804         vma = find_vma(mm, start);
805
806         if (vma && vma->vm_start <= start && vma->vm_end > start) {
807                 callback_fn((u64)(long)task, (u64)(long)vma,
808                             (u64)(long)callback_ctx, 0, 0);
809                 ret = 0;
810         }
811         bpf_mmap_unlock_mm(work, mm);
812         return ret;
813 }
814
815 const struct bpf_func_proto bpf_find_vma_proto = {
816         .func           = bpf_find_vma,
817         .ret_type       = RET_INTEGER,
818         .arg1_type      = ARG_PTR_TO_BTF_ID,
819         .arg1_btf_id    = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
820         .arg2_type      = ARG_ANYTHING,
821         .arg3_type      = ARG_PTR_TO_FUNC,
822         .arg4_type      = ARG_PTR_TO_STACK_OR_NULL,
823         .arg5_type      = ARG_ANYTHING,
824 };
825
826 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
827
828 static void do_mmap_read_unlock(struct irq_work *entry)
829 {
830         struct mmap_unlock_irq_work *work;
831
832         if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
833                 return;
834
835         work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
836         mmap_read_unlock_non_owner(work->mm);
837 }
838
839 static int __init task_iter_init(void)
840 {
841         struct mmap_unlock_irq_work *work;
842         int ret, cpu;
843
844         for_each_possible_cpu(cpu) {
845                 work = per_cpu_ptr(&mmap_unlock_work, cpu);
846                 init_irq_work(&work->irq_work, do_mmap_read_unlock);
847         }
848
849         task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
850         ret = bpf_iter_reg_target(&task_reg_info);
851         if (ret)
852                 return ret;
853
854         task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
855         task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
856         ret =  bpf_iter_reg_target(&task_file_reg_info);
857         if (ret)
858                 return ret;
859
860         task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
861         task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
862         return bpf_iter_reg_target(&task_vma_reg_info);
863 }
864 late_initcall(task_iter_init);