* specified at mount time and thus is implemented here.
*/
CGRP_CPUSET_CLONE_CHILDREN,
+
+ /* Control group has to be frozen. */
+ CGRP_FREEZE,
+
+ /* Cgroup is frozen. */
+ CGRP_FROZEN,
};
/* cgroup_root->flags */
struct cgroup *updated_next; /* NULL iff not on the list */
};
+struct cgroup_freezer_state {
+ /* Should the cgroup and its descendants be frozen. */
+ bool freeze;
+
+ /* Should the cgroup actually be frozen? */
+ int e_freeze;
+
+ /* Fields below are protected by css_set_lock */
+
+ /* Number of frozen descendant cgroups */
+ int nr_frozen_descendants;
+
+ /*
+ * Number of tasks, which are counted as frozen:
+ * frozen, SIGSTOPped, and PTRACEd.
+ */
+ int nr_frozen_tasks;
+};
+
struct cgroup {
/* self css with NULL ->ss, points back to this cgroup */
struct cgroup_subsys_state self;
/* If there is block congestion on this cgroup. */
atomic_t congestion_count;
+ /* Used to store internal freezer state */
+ struct cgroup_freezer_state freezer;
+
/* ids of the ancestors at each level including self */
int ancestor_ids[];
};
free_cgroup_ns(ns);
}
+#ifdef CONFIG_CGROUPS
+
+void cgroup_enter_frozen(void);
+void cgroup_leave_frozen(bool always_leave);
+void cgroup_update_frozen(struct cgroup *cgrp);
+void cgroup_freeze(struct cgroup *cgrp, bool freeze);
+void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
+ struct cgroup *dst);
+void cgroup_freezer_frozen_exit(struct task_struct *task);
+static inline bool cgroup_task_freeze(struct task_struct *task)
+{
+ bool ret;
+
+ if (task->flags & PF_KTHREAD)
+ return false;
+
+ rcu_read_lock();
+ ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool cgroup_task_frozen(struct task_struct *task)
+{
+ return task->frozen;
+}
+
+#else /* !CONFIG_CGROUPS */
+
+static inline void cgroup_enter_frozen(void) { }
+static inline void cgroup_leave_frozen(bool always_leave) { }
+static inline bool cgroup_task_freeze(struct task_struct *task)
+{
+ return false;
+}
+static inline bool cgroup_task_frozen(struct task_struct *task)
+{
+ return false;
+}
+
+#endif /* !CONFIG_CGROUPS */
+
#endif /* _LINUX_CGROUP_H */
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
+ /* task is frozen/stopped (used by the cgroup freezer) */
+ unsigned frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
/* to be used once the psi infrastructure lands upstream. */
#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
+#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
+#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
get_css_set(to_cset);
to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
- put_css_set_locked(from_cset);
from_cset->nr_tasks--;
+ /*
+ * If the source or destination cgroup is frozen,
+ * the task might require to change its state.
+ */
+ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
+ to_cset->dfl_cgrp);
+ put_css_set_locked(from_cset);
+
}
}
spin_unlock_irq(&css_set_lock);
static int cgroup_events_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "populated %d\n",
- cgroup_is_populated(seq_css(seq)->cgroup));
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
+ seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
+
return 0;
}
}
#endif
+static int cgroup_freeze_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "%d\n", cgrp->freezer.freeze);
+
+ return 0;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int freeze;
+
+ ret = kstrtoint(strstrip(buf), 0, &freeze);
+ if (ret)
+ return ret;
+
+ if (freeze < 0 || freeze > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgroup_freeze(cgrp, freeze);
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.freeze",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_freeze_show,
+ .write = cgroup_freeze_write,
+ },
+ {
.name = "cpu.stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
if (ret)
goto out_psi_free;
+ /*
+ * New cgroup inherits effective freeze counter, and
+ * if the parent has to be frozen, the child has too.
+ */
+ cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ if (cgrp->freezer.e_freeze)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
- if (tcgrp != cgrp)
+ if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups
+ * get a new frozen descendant, but their state can't
+ * change because of this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
}
spin_unlock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
+ /*
+ * If the dying cgroup is frozen, decrease frozen descendants
+ * counters of ancestor cgroups.
+ */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ tcgrp->freezer.nr_frozen_descendants--;
}
spin_unlock_irq(&css_set_lock);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
+
+ /*
+ * If the cgroup has to be frozen, the new task has too.
+ * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
+ * the task into the frozen state.
+ */
+ if (unlikely(cgroup_task_freeze(child))) {
+ struct cgroup *cgrp;
+
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ cgrp = cset->dfl_cgrp;
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later
+ * from do_freezer_trap(). So we avoid cgroup's
+ * transient switch from the frozen state and back.
+ */
+ }
+
spin_unlock_irq(&css_set_lock);
}
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
cset->nr_tasks--;
+
+ if (unlikely(cgroup_task_frozen(tsk)))
+ cgroup_freezer_frozen_exit(tsk);
+ else if (unlikely(cgroup_task_freeze(tsk)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
+
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
--- /dev/null
+//SPDX-License-Identifier: GPL-2.0
+#include <linux/cgroup.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+
+#include "cgroup-internal.h"
+
+/*
+ * Propagate the cgroup frozen state upwards by the cgroup tree.
+ */
+static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
+{
+ int desc = 1;
+
+ /*
+ * If the new state is frozen, some freezing ancestor cgroups may change
+ * their state too, depending on if all their descendants are frozen.
+ *
+ * Otherwise, all ancestor cgroups are forced into the non-frozen state.
+ */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (frozen) {
+ cgrp->freezer.nr_frozen_descendants += desc;
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
+ test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_descendants ==
+ cgrp->nr_descendants) {
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ desc++;
+ }
+ } else {
+ cgrp->freezer.nr_frozen_descendants -= desc;
+ if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ desc++;
+ }
+ }
+ }
+}
+
+/*
+ * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions.
+ */
+void cgroup_update_frozen(struct cgroup *cgrp)
+{
+ bool frozen;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /*
+ * If the cgroup has to be frozen (CGRP_FREEZE bit set),
+ * and all tasks are frozen and/or stopped, let's consider
+ * the cgroup frozen. Otherwise it's not frozen.
+ */
+ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
+
+ if (frozen) {
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
+
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ } else {
+ /* Already there? */
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
+
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ }
+ cgroup_file_notify(&cgrp->events_file);
+
+ /* Update the state of ancestor cgroups. */
+ cgroup_propagate_frozen(cgrp, frozen);
+}
+
+/*
+ * Increment cgroup's nr_frozen_tasks.
+ */
+static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)
+{
+ cgrp->freezer.nr_frozen_tasks++;
+}
+
+/*
+ * Decrement cgroup's nr_frozen_tasks.
+ */
+static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)
+{
+ cgrp->freezer.nr_frozen_tasks--;
+ WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);
+}
+
+/*
+ * Enter frozen/stopped state, if not yet there. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ */
+void cgroup_enter_frozen(void)
+{
+ struct cgroup *cgrp;
+
+ if (current->frozen)
+ return;
+
+ spin_lock_irq(&css_set_lock);
+ current->frozen = true;
+ cgrp = task_dfl_cgroup(current);
+ cgroup_inc_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
+}
+
+/*
+ * Conditionally leave frozen/stopped state. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ *
+ * If always_leave is not set, and the cgroup is freezing,
+ * we're racing with the cgroup freezing. In this case, we don't
+ * drop the frozen counter to avoid a transient switch to
+ * the unfrozen state.
+ */
+void cgroup_leave_frozen(bool always_leave)
+{
+ struct cgroup *cgrp;
+
+ spin_lock_irq(&css_set_lock);
+ cgrp = task_dfl_cgroup(current);
+ if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
+ cgroup_dec_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ WARN_ON_ONCE(!current->frozen);
+ current->frozen = false;
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ if (unlikely(current->frozen)) {
+ /*
+ * If the task remained in the frozen state,
+ * make sure it won't reach userspace without
+ * entering the signal handling loop.
+ */
+ spin_lock_irq(¤t->sighand->siglock);
+ recalc_sigpending();
+ spin_unlock_irq(¤t->sighand->siglock);
+ }
+}
+
+/*
+ * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
+ * jobctl bit.
+ */
+static void cgroup_freeze_task(struct task_struct *task, bool freeze)
+{
+ unsigned long flags;
+
+ /* If the task is about to die, don't bother with freezing it. */
+ if (!lock_task_sighand(task, &flags))
+ return;
+
+ if (freeze) {
+ task->jobctl |= JOBCTL_TRAP_FREEZE;
+ signal_wake_up(task, false);
+ } else {
+ task->jobctl &= ~JOBCTL_TRAP_FREEZE;
+ wake_up_process(task);
+ }
+
+ unlock_task_sighand(task, &flags);
+}
+
+/*
+ * Freeze or unfreeze all tasks in the given cgroup.
+ */
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ if (freeze)
+ set_bit(CGRP_FREEZE, &cgrp->flags);
+ else
+ clear_bit(CGRP_FREEZE, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Ignore kernel threads here. Freezing cgroups containing
+ * kthreads isn't supported.
+ */
+ if (task->flags & PF_KTHREAD)
+ continue;
+ cgroup_freeze_task(task, freeze);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * Cgroup state should be revisited here to cover empty leaf cgroups
+ * and cgroups which descendants are already in the desired state.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants)
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
+}
+
+/*
+ * Adjust the task state (freeze or unfreeze) and revisit the state of
+ * source and destination cgroups.
+ */
+void cgroup_freezer_migrate_task(struct task_struct *task,
+ struct cgroup *src, struct cgroup *dst)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ /*
+ * Kernel threads are not supposed to be frozen at all.
+ */
+ if (task->flags & PF_KTHREAD)
+ return;
+
+ /*
+ * Adjust counters of freezing and frozen tasks.
+ * Note, that if the task is frozen, but the destination cgroup is not
+ * frozen, we bump both counters to keep them balanced.
+ */
+ if (task->frozen) {
+ cgroup_inc_frozen_cnt(dst);
+ cgroup_dec_frozen_cnt(src);
+ }
+ cgroup_update_frozen(dst);
+ cgroup_update_frozen(src);
+
+ /*
+ * Force the task to the desired state.
+ */
+ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));
+}
+
+void cgroup_freezer_frozen_exit(struct task_struct *task)
+{
+ struct cgroup *cgrp = task_dfl_cgroup(task);
+
+ lockdep_assert_held(&css_set_lock);
+
+ cgroup_dec_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+}
+
+void cgroup_freeze(struct cgroup *cgrp, bool freeze)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+ bool applied = false;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Nothing changed? Just exit.
+ */
+ if (cgrp->freezer.freeze == freeze)
+ return;
+
+ cgrp->freezer.freeze = freeze;
+
+ /*
+ * Propagate changes downwards the cgroup tree.
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ dsct = css->cgroup;
+
+ if (cgroup_is_dead(dsct))
+ continue;
+
+ if (freeze) {
+ dsct->freezer.e_freeze++;
+ /*
+ * Already frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 1)
+ continue;
+ } else {
+ dsct->freezer.e_freeze--;
+ /*
+ * Still frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 0)
+ continue;
+
+ WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+ }
+
+ /*
+ * Do change actual state: freeze or unfreeze.
+ */
+ cgroup_do_freeze(dsct, freeze);
+ applied = true;
+ }
+
+ /*
+ * Even if the actual state hasn't changed, let's notify a user.
+ * The state can be enforced by an ancestor cgroup: the cgroup
+ * can already be in the desired state or it can be locked in the
+ * opposite state, so that the transition will never happen.
+ * In both cases it's better to notify a user, that there is
+ * nothing to wait for.
+ */
+ if (!applied)
+ cgroup_file_notify(&cgrp->events_file);
+}
int killed;
freezer_do_not_count();
+ cgroup_enter_frozen();
killed = wait_for_completion_killable(vfork);
+ cgroup_leave_frozen(false);
freezer_count();
if (killed) {
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/livepatch.h>
+#include <linux/cgroup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
static bool recalc_sigpending_tsk(struct task_struct *t)
{
- if ((t->jobctl & JOBCTL_PENDING_MASK) ||
+ if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
PENDING(&t->pending, &t->blocked) ||
- PENDING(&t->signal->shared_pending, &t->blocked)) {
+ PENDING(&t->signal->shared_pending, &t->blocked) ||
+ cgroup_task_frozen(t)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
return true;
}
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
+ cgroup_enter_frozen();
freezable_schedule();
} else {
/*
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
+ cgroup_enter_frozen();
freezable_schedule();
return true;
} else {
}
}
+/**
+ * do_freezer_trap - handle the freezer jobctl trap
+ *
+ * Puts the task into frozen state, if only the task is not about to quit.
+ * In this case it drops JOBCTL_TRAP_FREEZE.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held,
+ * which is always released before returning.
+ */
+static void do_freezer_trap(void)
+ __releases(¤t->sighand->siglock)
+{
+ /*
+ * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
+ * let's make another loop to give it a chance to be handled.
+ * In any case, we'll return back.
+ */
+ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
+ JOBCTL_TRAP_FREEZE) {
+ spin_unlock_irq(¤t->sighand->siglock);
+ return;
+ }
+
+ /*
+ * Now we're sure that there is no pending fatal signal and no
+ * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
+ * immediately (if there is a non-fatal signal pending), and
+ * put the task into sleep.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ clear_thread_flag(TIF_SIGPENDING);
+ spin_unlock_irq(¤t->sighand->siglock);
+ cgroup_enter_frozen();
+ freezable_schedule();
+}
+
static int ptrace_signal(int signr, kernel_siginfo_t *info)
{
/*
ksig->info.si_signo = signr = SIGKILL;
sigdelset(¤t->pending.signal, SIGKILL);
recalc_sigpending();
+ current->jobctl &= ~JOBCTL_TRAP_FREEZE;
+ spin_unlock_irq(&sighand->siglock);
+ if (unlikely(cgroup_task_frozen(current)))
+ cgroup_leave_frozen(true);
goto fatal;
}
do_signal_stop(0))
goto relock;
- if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
- do_jobctl_trap();
+ if (unlikely(current->jobctl &
+ (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
+ if (current->jobctl & JOBCTL_TRAP_MASK) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
+ do_freezer_trap();
+
+ goto relock;
+ }
+
+ /*
+ * If the task is leaving the frozen state, let's update
+ * cgroup counters and reset the frozen bit.
+ */
+ if (unlikely(cgroup_task_frozen(current))) {
spin_unlock_irq(&sighand->siglock);
+ cgroup_leave_frozen(true);
goto relock;
}
continue;
}
- fatal:
spin_unlock_irq(&sighand->siglock);
+ fatal:
/*
* Anything else is fatal, maybe with a core dump.