sched: Fix yet more sched_fork() races

author Peter Zijlstra <peterz@infradead.org>
Mon, 14 Feb 2022 09:16:57 +0000 (10:16 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Sat, 19 Feb 2022 10:11:05 +0000 (11:11 +0100)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h

index b9198a1..e84e54d 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
  extern void init_idle(struct task_struct *idle, int cpu);
  
  extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
-                           struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
  extern void sched_dead(struct task_struct *p);
  
  void __noreturn do_task_dead(void);
diff --git a/kernel/fork.c b/kernel/fork.c

index d75a528..c607d23 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2267,6 +2267,17 @@ static __latent_entropy struct task_struct *copy_process(
                 goto bad_fork_put_pidfd;
  
         /*
+        * Now that the cgroups are pinned, re-clone the parent cgroup and put
+        * the new task on the correct runqueue. All this *before* the task
+        * becomes visible.
+        *
+        * This isn't part of ->can_fork() because while the re-cloning is
+        * cgroup specific, it unconditionally needs to place the task on a
+        * runqueue.
+        */
+       sched_cgroup_fork(p, args);
+
+       /*
          * From this point on we must avoid any synchronous user-space
          * communication until we take the tasklist-lock. In particular, we do
          * not want user-space to be able to predict the process start-time by
@@ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process(
         write_unlock_irq(&tasklist_lock);
  
         proc_fork_connector(p);
-       sched_post_fork(p, args);
+       sched_post_fork(p);
         cgroup_post_fork(p, args);
         perf_event_fork(p);
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index fcf0c18..9745613 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data)
  }
  #endif
  
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
  {
-       bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
         int prio = p->static_prio - MAX_RT_PRIO;
         struct load_weight *load = &p->se.load;
  
@@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                         p->static_prio = NICE_TO_PRIO(0);
  
                 p->prio = p->normal_prio = p->static_prio;
-               set_load_weight(p);
+               set_load_weight(p, false);
  
                 /*
                  * We don't need the reset flag anymore after the fork. It has
@@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
  
         init_entity_runnable_average(&p->se);
  
+
  #ifdef CONFIG_SCHED_INFO
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         return 0;
  }
  
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
  {
         unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
-       struct task_group *tg;
-#endif
  
+       /*
+        * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+        * required yet, but lockdep gets upset if rules are violated.
+        */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
  #ifdef CONFIG_CGROUP_SCHED
-       tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
-                         struct task_group, css);
-       p->sched_task_group = autogroup_task_group(p, tg);
+       if (1) {
+               struct task_group *tg;
+               tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+                                 struct task_group, css);
+               tg = autogroup_task_group(p, tg);
+               p->sched_task_group = tg;
+       }
  #endif
         rseq_migrate(p);
         /*
@@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
  
+void sched_post_fork(struct task_struct *p)
+{
         uclamp_post_fork(p);
  }
  
@@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice)
                 put_prev_task(rq, p);
  
         p->static_prio = NICE_TO_PRIO(nice);
-       set_load_weight(p);
+       set_load_weight(p, true);
         old_prio = p->prio;
         p->prio = effective_prio(p);
  
@@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p,
          */
         p->rt_priority = attr->sched_priority;
         p->normal_prio = normal_prio(p);
-       set_load_weight(p);
+       set_load_weight(p, true);
  }
  
  /*
@@ -9446,7 +9454,7 @@ void __init sched_init(void)
  #endif
         }
  
-       set_load_weight(&init_task);
+       set_load_weight(&init_task, false);
  
         /*
          * The boot idle thread does lazy MMU switching as well:
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 14 Feb 2022 09:16:57 +0000 (10:16 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Sat, 19 Feb 2022 10:11:05 +0000 (11:11 +0100)
include/linux/sched/task.h		patch | blob | history
kernel/fork.c		patch | blob | history
kernel/sched/core.c		patch | blob | history