2019-05-19 15:08:55 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2016-12-27 22:49:06 +03:00
|
|
|
#include "cgroup-internal.h"
|
|
|
|
|
2016-12-27 22:49:08 +03:00
|
|
|
#include <linux/ctype.h>
|
2016-12-27 22:49:06 +03:00
|
|
|
#include <linux/kmod.h>
|
|
|
|
#include <linux/sort.h>
|
2016-12-27 22:49:08 +03:00
|
|
|
#include <linux/delay.h>
|
2016-12-27 22:49:06 +03:00
|
|
|
#include <linux/mm.h>
|
2017-02-02 10:35:14 +03:00
|
|
|
#include <linux/sched/signal.h>
|
2017-02-06 12:57:33 +03:00
|
|
|
#include <linux/sched/task.h>
|
2017-02-05 18:03:58 +03:00
|
|
|
#include <linux/magic.h>
|
2016-12-27 22:49:06 +03:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include <linux/delayacct.h>
|
|
|
|
#include <linux/pid_namespace.h>
|
|
|
|
#include <linux/cgroupstats.h>
|
2019-01-17 08:15:11 +03:00
|
|
|
#include <linux/fs_parser.h>
|
2016-12-27 22:49:06 +03:00
|
|
|
|
|
|
|
#include <trace/events/cgroup.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pidlists linger the following amount before being destroyed. The goal
|
|
|
|
 * is avoiding frequent destruction in the middle of consecutive read calls.
|
|
|
|
* Expiring in the middle is a performance problem not a correctness one.
|
|
|
|
* 1 sec should be enough.
|
|
|
|
*/
|
|
|
|
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
|
|
|
|
|
|
|
|
/* Controllers blocked by the commandline in v1 */
|
|
|
|
static u16 cgroup_no_v1_mask;
|
|
|
|
|
2018-12-28 21:31:07 +03:00
|
|
|
/* disable named v1 mounts */
|
|
|
|
static bool cgroup_no_v1_named;
|
|
|
|
|
2016-12-27 22:49:06 +03:00
|
|
|
/*
|
|
|
|
* pidlist destructions need to be flushed on cgroup destruction. Use a
|
|
|
|
* separate workqueue as flush domain.
|
|
|
|
*/
|
|
|
|
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
|
|
|
|
|
2020-03-12 23:44:35 +03:00
|
|
|
/* protects cgrp->root->release_agent_path */
|
2016-12-27 22:49:08 +03:00
|
|
|
static DEFINE_SPINLOCK(release_agent_path_lock);
|
2016-12-27 22:49:06 +03:00
|
|
|
|
2016-12-27 22:49:08 +03:00
|
|
|
bool cgroup1_ssid_disabled(int ssid)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
return cgroup_no_v1_mask & (1 << ssid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
|
|
|
|
* @from: attach to all cgroups of a given task
|
|
|
|
* @tsk: the task to be attached
|
2021-08-11 03:03:49 +03:00
|
|
|
*
|
|
|
|
* Return: %0 on success or a negative errno code on failure
|
2016-12-27 22:49:06 +03:00
|
|
|
*/
|
|
|
|
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
struct cgroup_root *root;
|
|
|
|
int retval = 0;
|
|
|
|
|
|
|
|
mutex_lock(&cgroup_mutex);
|
2022-08-26 05:48:29 +03:00
|
|
|
cgroup_attach_lock(true);
|
2016-12-27 22:49:06 +03:00
|
|
|
for_each_root(root) {
|
|
|
|
struct cgroup *from_cgrp;
|
|
|
|
|
|
|
|
spin_lock_irq(&css_set_lock);
|
|
|
|
from_cgrp = task_cgroup_from_root(from, root);
|
|
|
|
spin_unlock_irq(&css_set_lock);
|
|
|
|
|
|
|
|
retval = cgroup_attach_task(from_cgrp, tsk, false);
|
|
|
|
if (retval)
|
|
|
|
break;
|
|
|
|
}
|
2022-08-26 05:48:29 +03:00
|
|
|
cgroup_attach_unlock(true);
|
2016-12-27 22:49:06 +03:00
|
|
|
mutex_unlock(&cgroup_mutex);
|
|
|
|
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
|
|
|
|
|
|
|
|
/**
|
2021-08-11 03:03:49 +03:00
|
|
|
* cgroup_transfer_tasks - move tasks from one cgroup to another
|
2016-12-27 22:49:06 +03:00
|
|
|
* @to: cgroup to which the tasks will be moved
|
|
|
|
* @from: cgroup in which the tasks currently reside
|
|
|
|
*
|
|
|
|
* Locking rules between cgroup_post_fork() and the migration path
|
|
|
|
* guarantee that, if a task is forking while being migrated, the new child
|
|
|
|
* is guaranteed to be either visible in the source cgroup after the
|
|
|
|
* parent's migration is complete or put into the target cgroup. No task
|
|
|
|
* can slip out of migration through forking.
|
2021-08-11 03:03:49 +03:00
|
|
|
*
|
|
|
|
* Return: %0 on success or a negative errno code on failure
|
2016-12-27 22:49:06 +03:00
|
|
|
*/
|
|
|
|
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
|
|
|
|
{
|
2017-01-16 03:03:41 +03:00
|
|
|
DEFINE_CGROUP_MGCTX(mgctx);
|
2016-12-27 22:49:06 +03:00
|
|
|
struct cgrp_cset_link *link;
|
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (cgroup_on_dfl(to))
|
|
|
|
return -EINVAL;
|
|
|
|
|
cgroup: implement cgroup v2 thread support
This patch implements cgroup v2 thread support. The goal of the
thread mode is supporting hierarchical accounting and control at
thread granularity while staying inside the resource domain model
which allows coordination across different resource controllers and
handling of anonymous resource consumptions.
A cgroup is always created as a domain and can be made threaded by
writing to the "cgroup.type" file. When a cgroup becomes threaded, it
becomes a member of a threaded subtree which is anchored at the
closest ancestor which isn't threaded.
The threads of the processes which are in a threaded subtree can be
placed anywhere without being restricted by process granularity or
no-internal-process constraint. Note that the threads aren't allowed
to escape to a different threaded subtree. To be used inside a
threaded subtree, a controller should explicitly support threaded mode
and be able to handle internal competition in the way which is
appropriate for the resource.
The root of a threaded subtree, the nearest ancestor which isn't
threaded, is called the threaded domain and serves as the resource
domain for the whole subtree. This is the last cgroup where domain
controllers are operational and where all the domain-level resource
consumptions in the subtree are accounted. This allows threaded
controllers to operate at thread granularity when requested while
staying inside the scope of system-level resource distribution.
As the root cgroup is exempt from the no-internal-process constraint,
it can serve as both a threaded domain and a parent to normal cgroups,
so, unlike non-root cgroups, the root cgroup can have both domain and
threaded children.
Internally, in a threaded subtree, each css_set has its ->dom_cset
pointing to a matching css_set which belongs to the threaded domain.
This ensures that thread root level cgroup_subsys_state for all
threaded controllers are readily accessible for domain-level
operations.
This patch enables threaded mode for the pids and perf_events
controllers. Neither has to worry about domain-level resource
consumptions and it's enough to simply set the flag.
For more details on the interface and behavior of the thread mode,
please refer to the section 2-2-2 in Documentation/cgroup-v2.txt added
by this patch.
v5: - Dropped silly no-op ->dom_cgrp init from cgroup_create().
Spotted by Waiman.
- Documentation updated as suggested by Waiman.
- cgroup.type content slightly reformatted.
- Mark the debug controller threaded.
v4: - Updated to the general idea of marking specific cgroups
domain/threaded as suggested by PeterZ.
v3: - Dropped "join" and always make mixed children join the parent's
threaded subtree.
v2: - After discussions with Waiman, support for mixed thread mode is
added. This should address the issue that Peter pointed out
where any nesting should be avoided for thread subtrees while
coexisting with other domain cgroups.
- Enabling / disabling thread mode now piggy backs on the existing
control mask update mechanism.
- Bug fixes and cleanup.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2017-07-21 18:14:51 +03:00
|
|
|
ret = cgroup_migrate_vet_dst(to);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2016-12-27 22:49:06 +03:00
|
|
|
|
|
|
|
mutex_lock(&cgroup_mutex);
|
|
|
|
|
|
|
|
percpu_down_write(&cgroup_threadgroup_rwsem);
|
|
|
|
|
|
|
|
/* all tasks in @from are being moved, all csets are source */
|
|
|
|
spin_lock_irq(&css_set_lock);
|
|
|
|
list_for_each_entry(link, &from->cset_links, cset_link)
|
2017-01-16 03:03:41 +03:00
|
|
|
cgroup_migrate_add_src(link->cset, to, &mgctx);
|
2016-12-27 22:49:06 +03:00
|
|
|
spin_unlock_irq(&css_set_lock);
|
|
|
|
|
2017-01-16 03:03:41 +03:00
|
|
|
ret = cgroup_migrate_prepare_dst(&mgctx);
|
2016-12-27 22:49:06 +03:00
|
|
|
if (ret)
|
|
|
|
goto out_err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Migrate tasks one-by-one until @from is empty. This fails iff
|
|
|
|
* ->can_attach() fails.
|
|
|
|
*/
|
|
|
|
do {
|
2017-05-15 16:34:01 +03:00
|
|
|
css_task_iter_start(&from->self, 0, &it);
|
2017-12-19 10:26:57 +03:00
|
|
|
|
|
|
|
do {
|
|
|
|
task = css_task_iter_next(&it);
|
|
|
|
} while (task && (task->flags & PF_EXITING));
|
|
|
|
|
2016-12-27 22:49:06 +03:00
|
|
|
if (task)
|
|
|
|
get_task_struct(task);
|
|
|
|
css_task_iter_end(&it);
|
|
|
|
|
|
|
|
if (task) {
|
2017-01-16 03:03:41 +03:00
|
|
|
ret = cgroup_migrate(task, false, &mgctx);
|
2016-12-27 22:49:06 +03:00
|
|
|
if (!ret)
|
cgroup/tracing: Move taking of spin lock out of trace event handlers
It is unwise to take spin locks from the handlers of trace events.
Mainly, because they can introduce lockups, because it introduces locks
in places that are normally not tested. Worse yet, because trace events
are tucked away in the include/trace/events/ directory, locks that are
taken there are forgotten about.
As a general rule, I tell people never to take any locks in a trace
event handler.
Several cgroup trace event handlers call cgroup_path() which eventually
takes the kernfs_rename_lock spinlock. This injects the spinlock in the
code without people realizing it. It also can cause issues for the
PREEMPT_RT patch, as the spinlock becomes a mutex, and the trace event
handlers are called with preemption disabled.
By moving the calculation of the cgroup_path() out of the trace event
handlers and into a macro (surrounded by a
trace_cgroup_##type##_enabled()), then we could place the cgroup_path
into a string, and pass that to the trace event. Not only does this
remove the taking of the spinlock out of the trace event handler, but
it also means that the cgroup_path() only needs to be called once (it
is currently called twice, once to get the length to reserver the
buffer for, and once again to get the path itself. Now it only needs to
be done once.
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2018-07-10 00:48:54 +03:00
|
|
|
TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
|
2016-12-27 22:49:06 +03:00
|
|
|
put_task_struct(task);
|
|
|
|
}
|
|
|
|
} while (task && !ret);
|
|
|
|
out_err:
|
2017-01-16 03:03:41 +03:00
|
|
|
cgroup_migrate_finish(&mgctx);
|
2016-12-27 22:49:06 +03:00
|
|
|
percpu_up_write(&cgroup_threadgroup_rwsem);
|
|
|
|
mutex_unlock(&cgroup_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stuff for reading the 'tasks'/'procs' files.
|
|
|
|
*
|
|
|
|
* Reading this file can return large amounts of data if a cgroup has
|
|
|
|
* *lots* of attached tasks. So it may need several calls to read(),
|
|
|
|
* but we cannot guarantee that the information we produce is correct
|
|
|
|
* unless we produce it entirely atomically.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Selects which of the two pid-listing files a pidlist is built for. */
enum cgroup_filetype {
	CGROUP_FILE_PROCS = 0,	/* the "cgroup.procs" file */
	CGROUP_FILE_TASKS = 1,	/* the "tasks" file */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A pidlist is a list of pids that virtually represents the contents of one
|
|
|
|
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
|
|
|
|
* a pair (one each for procs, tasks) for each pid namespace that's relevant
|
|
|
|
* to the cgroup.
|
|
|
|
*/
|
|
|
|
struct cgroup_pidlist {
|
|
|
|
/*
|
|
|
|
* used to find which pidlist is wanted. doesn't change as long as
|
|
|
|
* this particular list stays in the list.
|
|
|
|
*/
|
|
|
|
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
|
|
|
|
/* array of xids */
|
|
|
|
pid_t *list;
|
|
|
|
/* how many elements the above list has */
|
|
|
|
int length;
|
|
|
|
/* each of these stored in a list by its cgroup */
|
|
|
|
struct list_head links;
|
|
|
|
/* pointer to the cgroup we belong to, for list removal purposes */
|
|
|
|
struct cgroup *owner;
|
|
|
|
/* for delayed destruction */
|
|
|
|
struct delayed_work destroy_dwork;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Used to destroy all pidlists lingering waiting for destroy timer. None
|
|
|
|
* should be left afterwards.
|
|
|
|
*/
|
2016-12-27 22:49:08 +03:00
|
|
|
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
struct cgroup_pidlist *l, *tmp_l;
|
|
|
|
|
|
|
|
mutex_lock(&cgrp->pidlist_mutex);
|
|
|
|
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
|
|
|
|
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
|
|
|
|
mutex_unlock(&cgrp->pidlist_mutex);
|
|
|
|
|
|
|
|
flush_workqueue(cgroup_pidlist_destroy_wq);
|
|
|
|
BUG_ON(!list_empty(&cgrp->pidlists));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct delayed_work *dwork = to_delayed_work(work);
|
|
|
|
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
|
|
|
|
destroy_dwork);
|
|
|
|
struct cgroup_pidlist *tofree = NULL;
|
|
|
|
|
|
|
|
mutex_lock(&l->owner->pidlist_mutex);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Destroy iff we didn't get queued again. The state won't change
|
|
|
|
* as destroy_dwork can only be queued while locked.
|
|
|
|
*/
|
|
|
|
if (!delayed_work_pending(dwork)) {
|
|
|
|
list_del(&l->links);
|
2019-08-06 16:24:12 +03:00
|
|
|
kvfree(l->list);
|
2016-12-27 22:49:06 +03:00
|
|
|
put_pid_ns(l->key.ns);
|
|
|
|
tofree = l;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&l->owner->pidlist_mutex);
|
|
|
|
kfree(tofree);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pidlist_uniq - collapse runs of equal neighbours in a pid array in place
 *
 * Only adjacent duplicates are removed, so the caller is expected to pass
 * a sorted array.  Returns the number of unique elements, which occupy the
 * front of @list afterwards.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/* empty and single-element lists are already unique */
	if (length == 0 || length == 1)
		return length;

	/*
	 * Element 0 is always kept.  @src scans the remainder while @dest
	 * marks the slot where the next unique value is stored.
	 */
	for (src = 1; src < length; src++) {
		if (list[src] != list[src - 1])
			list[dest++] = list[src];
	}

	return dest;
}
|
|
|
|
|
|
|
|
/*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member
 * tasks sorted by task pointer.  As pidlists can be fairly large,
 * allocating one per open file is dangerous, so cgroup had to implement a
 * shared pool of pidlists keyed by cgroup and namespace.
 *
 * cmppid() is the comparison callback used for that sort: it returns the
 * difference between the pid at @a and the pid at @b.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t *lhs = a;
	const pid_t *rhs = b;

	return *lhs - *rhs;
}
|
|
|
|
|
|
|
|
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
|
|
|
|
enum cgroup_filetype type)
|
|
|
|
{
|
|
|
|
struct cgroup_pidlist *l;
|
|
|
|
/* don't need task_nsproxy() if we're looking at ourself */
|
|
|
|
struct pid_namespace *ns = task_active_pid_ns(current);
|
|
|
|
|
|
|
|
lockdep_assert_held(&cgrp->pidlist_mutex);
|
|
|
|
|
|
|
|
list_for_each_entry(l, &cgrp->pidlists, links)
|
|
|
|
if (l->key.type == type && l->key.ns == ns)
|
|
|
|
return l;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* find the appropriate pidlist for our purpose (given procs vs tasks)
|
|
|
|
* returns with the lock on that pidlist already held, and takes care
|
|
|
|
* of the use count, or returns NULL with no locks held if we're out of
|
|
|
|
* memory.
|
|
|
|
*/
|
|
|
|
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
|
|
|
|
enum cgroup_filetype type)
|
|
|
|
{
|
|
|
|
struct cgroup_pidlist *l;
|
|
|
|
|
|
|
|
lockdep_assert_held(&cgrp->pidlist_mutex);
|
|
|
|
|
|
|
|
l = cgroup_pidlist_find(cgrp, type);
|
|
|
|
if (l)
|
|
|
|
return l;
|
|
|
|
|
|
|
|
/* entry not found; create a new one */
|
|
|
|
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
|
|
|
|
if (!l)
|
|
|
|
return l;
|
|
|
|
|
|
|
|
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
|
|
|
|
l->key.type = type;
|
|
|
|
/* don't need task_nsproxy() if we're looking at ourself */
|
|
|
|
l->key.ns = get_pid_ns(task_active_pid_ns(current));
|
|
|
|
l->owner = cgrp;
|
|
|
|
list_add(&l->links, &cgrp->pidlists);
|
|
|
|
return l;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
|
|
|
|
*/
|
|
|
|
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
|
|
|
|
struct cgroup_pidlist **lp)
|
|
|
|
{
|
|
|
|
pid_t *array;
|
|
|
|
int length;
|
|
|
|
int pid, n = 0; /* used for populating the array */
|
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *tsk;
|
|
|
|
struct cgroup_pidlist *l;
|
|
|
|
|
|
|
|
lockdep_assert_held(&cgrp->pidlist_mutex);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If cgroup gets more users after we read count, we won't have
|
|
|
|
* enough space - tough. This race is indistinguishable to the
|
|
|
|
* caller from the case that the additional cgroup users didn't
|
|
|
|
* show up until sometime later on.
|
|
|
|
*/
|
|
|
|
length = cgroup_task_count(cgrp);
|
2019-08-06 16:24:12 +03:00
|
|
|
array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
|
2016-12-27 22:49:06 +03:00
|
|
|
if (!array)
|
|
|
|
return -ENOMEM;
|
|
|
|
/* now, populate the array */
|
2017-05-15 16:34:01 +03:00
|
|
|
css_task_iter_start(&cgrp->self, 0, &it);
|
2016-12-27 22:49:06 +03:00
|
|
|
while ((tsk = css_task_iter_next(&it))) {
|
|
|
|
if (unlikely(n == length))
|
|
|
|
break;
|
|
|
|
/* get tgid or pid for procs or tasks file respectively */
|
|
|
|
if (type == CGROUP_FILE_PROCS)
|
|
|
|
pid = task_tgid_vnr(tsk);
|
|
|
|
else
|
|
|
|
pid = task_pid_vnr(tsk);
|
|
|
|
if (pid > 0) /* make sure to only use valid results */
|
|
|
|
array[n++] = pid;
|
|
|
|
}
|
|
|
|
css_task_iter_end(&it);
|
|
|
|
length = n;
|
|
|
|
/* now sort & (if procs) strip out duplicates */
|
|
|
|
sort(array, length, sizeof(pid_t), cmppid, NULL);
|
|
|
|
if (type == CGROUP_FILE_PROCS)
|
|
|
|
length = pidlist_uniq(array, length);
|
|
|
|
|
|
|
|
l = cgroup_pidlist_find_create(cgrp, type);
|
|
|
|
if (!l) {
|
2019-08-06 16:24:12 +03:00
|
|
|
kvfree(array);
|
2016-12-27 22:49:06 +03:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* store array, freeing old if necessary */
|
2019-08-06 16:24:12 +03:00
|
|
|
kvfree(l->list);
|
2016-12-27 22:49:06 +03:00
|
|
|
l->list = array;
|
|
|
|
l->length = length;
|
|
|
|
*lp = l;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* seq_file methods for the tasks/procs files. The seq_file position is the
|
|
|
|
* next pid to display; the seq_file iterator is a pointer to the pid
|
|
|
|
* in the cgroup->l->list array.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * seq_file ->start() for the pid files.
 *
 * *@pos arrives as one more than the last pid shown (or 0 on the first
 * call or after a seek to the start).  A binary search locates the next
 * pid to display; on success *@pos is updated to that pid and a pointer
 * to its array slot is returned.  NULL means we're off the end of the
 * array, and an ERR_PTR() is returned if loading the pidlist fails.
 *
 * Every return path leaves @cgrp->pidlist_mutex held; it is released by
 * cgroup_pidlist_stop().
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	enum cgroup_filetype type = seq_cft(s)->private;
	struct cgroup_pidlist *l;
	int pid = *pos;
	int lo = 0;
	int err;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first
	 * start() after open.  If the matching pidlist is around, we can use
	 * that.  Look for it.  Note that @ctx->procs1.pidlist can't be used
	 * directly.  It could already have been destroyed.
	 */
	if (ctx->procs1.pidlist)
		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
	if (!ctx->procs1.pidlist) {
		err = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
		if (err)
			return ERR_PTR(err);
	}
	l = ctx->procs1.pidlist;

	if (pid) {
		int hi = l->length;

		/* find @pid itself or, failing that, its insertion point */
		while (lo < hi) {
			int mid = (lo + hi) / 2;
			int cur = l->list[mid];

			if (cur == pid) {
				lo = mid;
				break;
			}
			if (cur < pid)
				lo = mid + 1;
			else
				hi = mid;
		}
	}

	/* If we're off the end of the array, we're done */
	if (lo >= l->length)
		return NULL;

	/* Update the abstract position to be the actual pid that we found */
	*pos = l->list[lo];
	return &l->list[lo];
}
|
|
|
|
|
|
|
|
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
|
|
|
|
{
|
|
|
|
struct kernfs_open_file *of = s->private;
|
2022-01-07 00:02:29 +03:00
|
|
|
struct cgroup_file_ctx *ctx = of->priv;
|
|
|
|
struct cgroup_pidlist *l = ctx->procs1.pidlist;
|
2016-12-27 22:49:06 +03:00
|
|
|
|
|
|
|
if (l)
|
|
|
|
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
|
|
|
|
CGROUP_PIDLIST_DESTROY_DELAY);
|
|
|
|
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * seq_file ->next() for the pid files.  @v points at the current slot of
 * the pid array; step to the following one.  While inside the array,
 * *@pos becomes the pid stored there; once past the end, *@pos is bumped
 * by one and NULL is returned.
 */
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup_pidlist *l = ctx->procs1.pidlist;
	pid_t *next = (pid_t *)v + 1;
	pid_t *end = l->list + l->length;

	if (next < end) {
		*pos = *next;
		return next;
	}

	/* ran off the end of the array, we're done */
	(*pos)++;
	return NULL;
}
|
|
|
|
|
|
|
|
/* seq_file ->show(): print the pid that @v points at, one per line. */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	int *entry = v;

	seq_printf(s, "%d\n", *entry);
	return 0;
}
|
|
|
|
|
2017-05-15 16:34:00 +03:00
|
|
|
static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off,
|
|
|
|
bool threadgroup)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
2017-05-15 16:34:00 +03:00
|
|
|
struct cgroup *cgrp;
|
|
|
|
struct task_struct *task;
|
|
|
|
const struct cred *cred, *tcred;
|
|
|
|
ssize_t ret;
|
cgroup: Optimize single thread migration
There are reports of users who use thread migrations between cgroups and
they report performance drop after d59cfc09c32a ("sched, cgroup: replace
signal_struct->group_rwsem with a global percpu_rwsem"). The effect is
pronounced on machines with more CPUs.
The migration is affected by forking noise happening in the background,
after the mentioned commit a migrating thread must wait for all
(forking) processes on the system, not only of its threadgroup.
There are several places that need to synchronize with migration:
a) do_exit,
b) de_thread,
c) copy_process,
d) cgroup_update_dfl_csses,
e) parallel migration (cgroup_{proc,thread}s_write).
In the case of self-migrating thread, we relax the synchronization on
cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are
excluded with cgroup_mutex, c) does not matter in case of single thread
migration and the executing thread cannot exec(2) or exit(2) while it is
writing into cgroup.threads. In case of do_exit because of signal
delivery, we either exit before the migration or finish the migration
(of not yet PF_EXITING thread) and die afterwards.
This patch handles only the case of self-migration by writing "0" into
cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem
with numeric PIDs.
This change improves migration dependent workload performance similar
to per-signal_struct state.
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2019-10-04 13:57:40 +03:00
|
|
|
bool locked;
|
2017-05-15 16:34:00 +03:00
|
|
|
|
|
|
|
cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
|
if (!cgrp)
|
|
|
|
return -ENODEV;
|
|
|
|
|
cgroup: Optimize single thread migration
There are reports of users who use thread migrations between cgroups and
they report performance drop after d59cfc09c32a ("sched, cgroup: replace
signal_struct->group_rwsem with a global percpu_rwsem"). The effect is
pronounced on machines with more CPUs.
The migration is affected by forking noise happening in the background,
after the mentioned commit a migrating thread must wait for all
(forking) processes on the system, not only of its threadgroup.
There are several places that need to synchronize with migration:
a) do_exit,
b) de_thread,
c) copy_process,
d) cgroup_update_dfl_csses,
e) parallel migration (cgroup_{proc,thread}s_write).
In the case of self-migrating thread, we relax the synchronization on
cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are
excluded with cgroup_mutex, c) does not matter in case of single thread
migration and the executing thread cannot exec(2) or exit(2) while it is
writing into cgroup.threads. In case of do_exit because of signal
delivery, we either exit before the migration or finish the migration
(of not yet PF_EXITING thread) and die afterwards.
This patch handles only the case of self-migration by writing "0" into
cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem
with numeric PIDs.
This change improves migration dependent workload performance similar
to per-signal_struct state.
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2019-10-04 13:57:40 +03:00
|
|
|
task = cgroup_procs_write_start(buf, threadgroup, &locked);
|
2017-05-15 16:34:00 +03:00
|
|
|
ret = PTR_ERR_OR_ZERO(task);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
/*
|
2022-01-07 00:02:28 +03:00
|
|
|
* Even if we're attaching all tasks in the thread group, we only need
|
|
|
|
* to check permissions on one of them. Check permissions using the
|
|
|
|
* credentials from file open to protect against inherited fd attacks.
|
2017-05-15 16:34:00 +03:00
|
|
|
*/
|
2022-01-07 00:02:28 +03:00
|
|
|
cred = of->file->f_cred;
|
2017-05-15 16:34:00 +03:00
|
|
|
tcred = get_task_cred(task);
|
|
|
|
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
|
|
|
|
!uid_eq(cred->euid, tcred->uid) &&
|
|
|
|
!uid_eq(cred->euid, tcred->suid))
|
|
|
|
ret = -EACCES;
|
|
|
|
put_cred(tcred);
|
|
|
|
if (ret)
|
|
|
|
goto out_finish;
|
|
|
|
|
|
|
|
ret = cgroup_attach_task(cgrp, task, threadgroup);
|
|
|
|
|
|
|
|
out_finish:
|
cgroup: Optimize single thread migration
There are reports of users who use thread migrations between cgroups and
they report performance drop after d59cfc09c32a ("sched, cgroup: replace
signal_struct->group_rwsem with a global percpu_rwsem"). The effect is
pronounced on machines with more CPUs.
The migration is affected by forking noise happening in the background,
after the mentioned commit a migrating thread must wait for all
(forking) processes on the system, not only of its threadgroup.
There are several places that need to synchronize with migration:
a) do_exit,
b) de_thread,
c) copy_process,
d) cgroup_update_dfl_csses,
e) parallel migration (cgroup_{proc,thread}s_write).
In the case of self-migrating thread, we relax the synchronization on
cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are
excluded with cgroup_mutex, c) does not matter in case of single thread
migration and the executing thread cannot exec(2) or exit(2) while it is
writing into cgroup.threads. In case of do_exit because of signal
delivery, we either exit before the migration or finish the migration
(of not yet PF_EXITING thread) and die afterwards.
This patch handles only the case of self-migration by writing "0" into
cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem
with numeric PIDs.
This change improves migration dependent workload performance similar
to per-signal_struct state.
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2019-10-04 13:57:40 +03:00
|
|
|
cgroup_procs_write_finish(task, locked);
|
2017-05-15 16:34:00 +03:00
|
|
|
out_unlock:
|
|
|
|
cgroup_kn_unlock(of->kn);
|
|
|
|
|
|
|
|
return ret ?: nbytes;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Write handler of "cgroup.procs": the write is handled in threadgroup mode. */
static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	const bool threadgroup = true;

	return __cgroup1_procs_write(of, buf, nbytes, off, threadgroup);
}
|
|
|
|
|
|
|
|
/* Write handler of "tasks": the write is handled in single-task mode. */
static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	const bool threadgroup = false;

	return __cgroup1_procs_write(of, buf, nbytes, off, threadgroup);
}
|
|
|
|
|
|
|
|
/*
 * Write handler for the release agent path.  The written string, with
 * surrounding whitespace stripped, is stored in the root's
 * release_agent_path under release_agent_path_lock.
 *
 * Returns @nbytes on success, -EPERM if the opener is not sufficiently
 * privileged, or -ENODEV if the cgroup is no longer live.
 */
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	struct cgroup_file_ctx *ctx;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	/*
	 * Release agent gets called with all capabilities,
	 * require capabilities to set release agent.
	 */
	ctx = of->priv;
	if ((ctx->ns->user_ns != &init_user_ns) ||
	    !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	spin_lock(&release_agent_path_lock);
	/*
	 * strscpy() rather than the deprecated strlcpy(): it never reads
	 * the source beyond what fits in the destination and always
	 * NUL-terminates.  An over-long path is still silently truncated,
	 * as before.
	 */
	strscpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);

	cgroup_kn_unlock(of->kn);
	return nbytes;
}
|
|
|
|
|
|
|
|
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
|
|
|
|
{
|
|
|
|
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
|
|
|
|
|
|
|
spin_lock(&release_agent_path_lock);
|
|
|
|
seq_puts(seq, cgrp->root->release_agent_path);
|
|
|
|
spin_unlock(&release_agent_path_lock);
|
|
|
|
seq_putc(seq, '\n');
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Show handler for "cgroup.sane_behavior": the content is always "0". */
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	static const char content[] = "0\n";

	seq_puts(seq, content);
	return 0;
}
|
|
|
|
|
|
|
|
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
|
|
|
{
|
|
|
|
return notify_on_release(css->cgroup);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * write_u64 handler: a non-zero @val sets CGRP_NOTIFY_ON_RELEASE on the
 * css's cgroup, zero clears it.  Always returns 0.
 */
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	struct cgroup *cgrp = css->cgroup;

	if (!val) {
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
		return 0;
	}

	set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	return 0;
}
|
|
|
|
|
|
|
|
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
|
|
|
{
|
|
|
|
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * write_u64 handler for "cgroup.clone_children": any non-zero @val sets
 * CGRP_CPUSET_CLONE_CHILDREN on the cgroup owning @css, zero clears it.
 * Always returns 0.
 */
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct cgroup *cgrp = css->cgroup;

	if (!val) {
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
		return 0;
	}

	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
	return 0;
}
|
|
|
|
|
|
|
|
/* cgroup core interface files for the legacy hierarchies */
|
2016-12-27 22:49:08 +03:00
|
|
|
struct cftype cgroup1_base_files[] = {
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
.name = "cgroup.procs",
|
|
|
|
.seq_start = cgroup_pidlist_start,
|
|
|
|
.seq_next = cgroup_pidlist_next,
|
|
|
|
.seq_stop = cgroup_pidlist_stop,
|
|
|
|
.seq_show = cgroup_pidlist_show,
|
|
|
|
.private = CGROUP_FILE_PROCS,
|
2017-05-15 16:34:00 +03:00
|
|
|
.write = cgroup1_procs_write,
|
2016-12-27 22:49:06 +03:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "cgroup.clone_children",
|
|
|
|
.read_u64 = cgroup_clone_children_read,
|
|
|
|
.write_u64 = cgroup_clone_children_write,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "cgroup.sane_behavior",
|
|
|
|
.flags = CFTYPE_ONLY_ON_ROOT,
|
|
|
|
.seq_show = cgroup_sane_behavior_show,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "tasks",
|
|
|
|
.seq_start = cgroup_pidlist_start,
|
|
|
|
.seq_next = cgroup_pidlist_next,
|
|
|
|
.seq_stop = cgroup_pidlist_stop,
|
|
|
|
.seq_show = cgroup_pidlist_show,
|
|
|
|
.private = CGROUP_FILE_TASKS,
|
2017-05-15 16:34:00 +03:00
|
|
|
.write = cgroup1_tasks_write,
|
2016-12-27 22:49:06 +03:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "notify_on_release",
|
|
|
|
.read_u64 = cgroup_read_notify_on_release,
|
|
|
|
.write_u64 = cgroup_write_notify_on_release,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "release_agent",
|
|
|
|
.flags = CFTYPE_ONLY_ON_ROOT,
|
|
|
|
.seq_show = cgroup_release_agent_show,
|
|
|
|
.write = cgroup_release_agent_write,
|
|
|
|
.max_write_len = PATH_MAX - 1,
|
|
|
|
},
|
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Display information about each subsystem and each hierarchy */
|
2018-05-15 16:57:23 +03:00
|
|
|
int proc_cgroupstats_show(struct seq_file *m, void *v)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
|
|
|
|
/*
|
2021-10-25 09:19:16 +03:00
|
|
|
* Grab the subsystems state racily. No need to add avenue to
|
|
|
|
* cgroup_mutex contention.
|
2016-12-27 22:49:06 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
for_each_subsys(ss, i)
|
|
|
|
seq_printf(m, "%s\t%d\t%d\t%d\n",
|
|
|
|
ss->legacy_name, ss->root->hierarchy_id,
|
|
|
|
atomic_read(&ss->root->nr_cgrps),
|
|
|
|
cgroup_ssid_enabled(i));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cgroupstats_build - build and fill cgroupstats
|
|
|
|
* @stats: cgroupstats to fill information into
|
|
|
|
* @dentry: A dentry entry belonging to the cgroup for which stats have
|
|
|
|
* been requested.
|
|
|
|
*
|
|
|
|
* Build and fill cgroupstats so that taskstats can export it to user
|
|
|
|
* space.
|
2021-08-11 03:03:49 +03:00
|
|
|
*
|
|
|
|
* Return: %0 on success or a negative errno code on failure
|
2016-12-27 22:49:06 +03:00
|
|
|
*/
|
|
|
|
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
|
|
|
|
struct cgroup *cgrp;
|
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *tsk;
|
|
|
|
|
|
|
|
/* it should be kernfs_node belonging to cgroupfs and is a directory */
|
|
|
|
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
|
|
|
|
kernfs_type(kn) != KERNFS_DIR)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We aren't being called from kernfs and there's no guarantee on
|
|
|
|
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
|
|
|
|
* @kn->priv is RCU safe. Let's do the RCU dancing.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2016-12-27 22:49:09 +03:00
|
|
|
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
|
2021-10-25 09:19:15 +03:00
|
|
|
if (!cgrp || !cgroup_tryget(cgrp)) {
|
2016-12-27 22:49:06 +03:00
|
|
|
rcu_read_unlock();
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2017-05-15 16:34:01 +03:00
|
|
|
css_task_iter_start(&cgrp->self, 0, &it);
|
2016-12-27 22:49:06 +03:00
|
|
|
while ((tsk = css_task_iter_next(&it))) {
|
2021-06-11 11:28:17 +03:00
|
|
|
switch (READ_ONCE(tsk->__state)) {
|
2016-12-27 22:49:06 +03:00
|
|
|
case TASK_RUNNING:
|
|
|
|
stats->nr_running++;
|
|
|
|
break;
|
|
|
|
case TASK_INTERRUPTIBLE:
|
|
|
|
stats->nr_sleeping++;
|
|
|
|
break;
|
|
|
|
case TASK_UNINTERRUPTIBLE:
|
|
|
|
stats->nr_uninterruptible++;
|
|
|
|
break;
|
|
|
|
case TASK_STOPPED:
|
|
|
|
stats->nr_stopped++;
|
|
|
|
break;
|
|
|
|
default:
|
2021-04-13 04:39:05 +03:00
|
|
|
if (tsk->in_iowait)
|
2016-12-27 22:49:06 +03:00
|
|
|
stats->nr_io_wait++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
css_task_iter_end(&it);
|
|
|
|
|
2021-10-25 09:19:15 +03:00
|
|
|
cgroup_put(cgrp);
|
2016-12-27 22:49:06 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-12-27 22:49:08 +03:00
|
|
|
void cgroup1_check_for_release(struct cgroup *cgrp)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
|
|
|
|
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
|
|
|
|
schedule_work(&cgrp->release_agent_work);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Notify userspace when a cgroup is released, by running the
|
|
|
|
* configured release agent with the name of the cgroup (path
|
|
|
|
* relative to the root of cgroup file system) as the argument.
|
|
|
|
*
|
|
|
|
* Most likely, this user command will try to rmdir this cgroup.
|
|
|
|
*
|
|
|
|
* This races with the possibility that some other task will be
|
|
|
|
* attached to this cgroup before it is removed, or that some other
|
|
|
|
* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
|
|
|
|
* The presumed 'rmdir' will fail quietly if this cgroup is no longer
|
|
|
|
* unused, and this cgroup will be reprieved from its death sentence,
|
|
|
|
* to continue to serve a useful existence. Next time it's released,
|
|
|
|
* we will get notified again, if it still has 'notify_on_release' set.
|
|
|
|
*
|
|
|
|
* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
|
|
|
|
* means only wait until the task is successfully execve()'d. The
|
|
|
|
* separate release agent task is forked by call_usermodehelper(),
|
|
|
|
* then control in this thread returns here, without waiting for the
|
|
|
|
* release agent task. We don't bother to wait because the caller of
|
|
|
|
* this routine has no use for the exit status of the release agent
|
|
|
|
* task, so no sense holding our caller up for that.
|
|
|
|
*/
|
2016-12-27 22:49:08 +03:00
|
|
|
void cgroup1_release_agent(struct work_struct *work)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
struct cgroup *cgrp =
|
|
|
|
container_of(work, struct cgroup, release_agent_work);
|
2020-03-12 23:44:35 +03:00
|
|
|
char *pathbuf, *agentbuf;
|
2016-12-27 22:49:06 +03:00
|
|
|
char *argv[3], *envp[3];
|
|
|
|
int ret;
|
|
|
|
|
2020-03-12 23:44:35 +03:00
|
|
|
/* snoop agent path and exit early if empty */
|
|
|
|
if (!cgrp->root->release_agent_path[0])
|
|
|
|
return;
|
2016-12-27 22:49:06 +03:00
|
|
|
|
2020-03-12 23:44:35 +03:00
|
|
|
/* prepare argument buffers */
|
2016-12-27 22:49:06 +03:00
|
|
|
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
|
2020-03-12 23:44:35 +03:00
|
|
|
agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
|
|
|
|
if (!pathbuf || !agentbuf)
|
|
|
|
goto out_free;
|
2016-12-27 22:49:06 +03:00
|
|
|
|
2020-03-12 23:44:35 +03:00
|
|
|
spin_lock(&release_agent_path_lock);
|
|
|
|
strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
|
|
|
|
spin_unlock(&release_agent_path_lock);
|
|
|
|
if (!agentbuf[0])
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
2016-12-27 22:49:06 +03:00
|
|
|
if (ret < 0 || ret >= PATH_MAX)
|
2020-03-12 23:44:35 +03:00
|
|
|
goto out_free;
|
2016-12-27 22:49:06 +03:00
|
|
|
|
|
|
|
argv[0] = agentbuf;
|
|
|
|
argv[1] = pathbuf;
|
|
|
|
argv[2] = NULL;
|
|
|
|
|
|
|
|
/* minimal command environment */
|
|
|
|
envp[0] = "HOME=/";
|
|
|
|
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
|
|
|
|
envp[2] = NULL;
|
|
|
|
|
|
|
|
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
|
|
|
|
out_free:
|
|
|
|
kfree(agentbuf);
|
|
|
|
kfree(pathbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cgroup_rename - Only allow simple rename of directories in place.
|
|
|
|
*/
|
2016-12-27 22:49:08 +03:00
|
|
|
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
|
|
|
|
const char *new_name_str)
|
2016-12-27 22:49:06 +03:00
|
|
|
{
|
|
|
|
struct cgroup *cgrp = kn->priv;
|
|
|
|
int ret;
|
|
|
|
|
2021-06-09 10:17:19 +03:00
|
|
|
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
|
|
|
|
if (strchr(new_name_str, '\n'))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-12-27 22:49:06 +03:00
|
|
|
if (kernfs_type(kn) != KERNFS_DIR)
|
|
|
|
return -ENOTDIR;
|
|
|
|
if (kn->parent != new_parent)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're gonna grab cgroup_mutex which nests outside kernfs
|
|
|
|
* active_ref. kernfs_rename() doesn't require active_ref
|
|
|
|
* protection. Break them before grabbing cgroup_mutex.
|
|
|
|
*/
|
|
|
|
kernfs_break_active_protection(new_parent);
|
|
|
|
kernfs_break_active_protection(kn);
|
|
|
|
|
|
|
|
mutex_lock(&cgroup_mutex);
|
|
|
|
|
|
|
|
ret = kernfs_rename(kn, new_parent, new_name_str);
|
|
|
|
if (!ret)
|
cgroup/tracing: Move taking of spin lock out of trace event handlers
It is unwise to take spin locks from the handlers of trace events.
Mainly, because they can introduce lockups, because it introduces locks
in places that are normally not tested. Worse yet, because trace events
are tucked away in the include/trace/events/ directory, locks that are
taken there are forgotten about.
As a general rule, I tell people never to take any locks in a trace
event handler.
Several cgroup trace event handlers call cgroup_path() which eventually
takes the kernfs_rename_lock spinlock. This injects the spinlock in the
code without people realizing it. It also can cause issues for the
PREEMPT_RT patch, as the spinlock becomes a mutex, and the trace event
handlers are called with preemption disabled.
By moving the calculation of the cgroup_path() out of the trace event
handlers and into a macro (surrounded by a
trace_cgroup_##type##_enabled()), then we could place the cgroup_path
into a string, and pass that to the trace event. Not only does this
remove the taking of the spinlock out of the trace event handler, but
it also means that the cgroup_path() only needs to be called once (it
is currently called twice, once to get the length to reserver the
buffer for, and once again to get the path itself. Now it only needs to
be done once.
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2018-07-10 00:48:54 +03:00
|
|
|
TRACE_CGROUP_PATH(rename, cgrp);
|
2016-12-27 22:49:06 +03:00
|
|
|
|
|
|
|
mutex_unlock(&cgroup_mutex);
|
|
|
|
|
|
|
|
kernfs_unbreak_active_protection(kn);
|
|
|
|
kernfs_unbreak_active_protection(new_parent);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-12-27 22:49:08 +03:00
|
|
|
static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
|
|
|
|
{
|
|
|
|
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
int ssid;
|
|
|
|
|
|
|
|
for_each_subsys(ss, ssid)
|
|
|
|
if (root->subsys_mask & (1 << ssid))
|
|
|
|
seq_show_option(seq, ss->legacy_name, NULL);
|
|
|
|
if (root->flags & CGRP_ROOT_NOPREFIX)
|
|
|
|
seq_puts(seq, ",noprefix");
|
|
|
|
if (root->flags & CGRP_ROOT_XATTR)
|
|
|
|
seq_puts(seq, ",xattr");
|
2017-08-17 22:33:09 +03:00
|
|
|
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
|
|
|
|
seq_puts(seq, ",cpuset_v2_mode");
|
2022-07-23 17:28:28 +03:00
|
|
|
if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
|
|
|
|
seq_puts(seq, ",favordynmods");
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
spin_lock(&release_agent_path_lock);
|
|
|
|
if (strlen(root->release_agent_path))
|
|
|
|
seq_show_option(seq, "release_agent",
|
|
|
|
root->release_agent_path);
|
|
|
|
spin_unlock(&release_agent_path_lock);
|
|
|
|
|
|
|
|
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
|
|
|
|
seq_puts(seq, ",clone_children");
|
|
|
|
if (strlen(root->name))
|
|
|
|
seq_show_option(seq, "name", root->name);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-17 08:15:11 +03:00
|
|
|
/*
 * Identifiers for the cgroup v1 mount parameters; each value is tied to a
 * parameter name in cgroup1_fs_parameters[].
 */
enum cgroup1_param {
	Opt_all,		/* "all" */
	Opt_clone_children,	/* "clone_children" */
	Opt_cpuset_v2_mode,	/* "cpuset_v2_mode" */
	Opt_name,		/* "name=<string>" */
	Opt_none,		/* "none" */
	Opt_noprefix,		/* "noprefix" */
	Opt_release_agent,	/* "release_agent=<string>" */
	Opt_xattr,		/* "xattr" */
	Opt_favordynmods,	/* "favordynmods" */
	Opt_nofavordynmods,	/* "nofavordynmods" */
};
|
2016-12-27 22:49:08 +03:00
|
|
|
|
2019-09-07 14:23:15 +03:00
|
|
|
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
|
2019-01-17 08:15:11 +03:00
|
|
|
fsparam_flag ("all", Opt_all),
|
|
|
|
fsparam_flag ("clone_children", Opt_clone_children),
|
|
|
|
fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
|
|
|
|
fsparam_string("name", Opt_name),
|
|
|
|
fsparam_flag ("none", Opt_none),
|
|
|
|
fsparam_flag ("noprefix", Opt_noprefix),
|
|
|
|
fsparam_string("release_agent", Opt_release_agent),
|
|
|
|
fsparam_flag ("xattr", Opt_xattr),
|
2022-07-23 17:28:28 +03:00
|
|
|
fsparam_flag ("favordynmods", Opt_favordynmods),
|
|
|
|
fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
|
2019-01-17 08:15:11 +03:00
|
|
|
{}
|
|
|
|
};
|
2016-12-27 22:49:08 +03:00
|
|
|
|
2019-01-17 08:15:11 +03:00
|
|
|
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
|
|
|
|
{
|
|
|
|
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
struct fs_parse_result result;
|
|
|
|
int opt, i;
|
|
|
|
|
2019-09-07 14:23:15 +03:00
|
|
|
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
|
2019-01-17 08:15:11 +03:00
|
|
|
if (opt == -ENOPARAM) {
|
2021-07-14 16:47:50 +03:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = vfs_parse_fs_param_source(fc, param);
|
|
|
|
if (ret != -ENOPARAM)
|
|
|
|
return ret;
|
2016-12-27 22:49:08 +03:00
|
|
|
for_each_subsys(ss, i) {
|
2019-01-17 08:15:11 +03:00
|
|
|
if (strcmp(param->key, ss->legacy_name))
|
2016-12-27 22:49:08 +03:00
|
|
|
continue;
|
2021-01-15 12:37:17 +03:00
|
|
|
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
|
|
|
|
return invalfc(fc, "Disabled controller '%s'",
|
|
|
|
param->key);
|
2019-01-17 07:42:38 +03:00
|
|
|
ctx->subsys_mask |= (1 << i);
|
2019-01-17 08:15:11 +03:00
|
|
|
return 0;
|
2016-12-27 22:49:08 +03:00
|
|
|
}
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "Unknown subsys name '%s'", param->key);
|
2019-01-17 08:15:11 +03:00
|
|
|
}
|
|
|
|
if (opt < 0)
|
|
|
|
return opt;
|
|
|
|
|
|
|
|
switch (opt) {
|
|
|
|
case Opt_none:
|
|
|
|
/* Explicitly have no subsystems */
|
|
|
|
ctx->none = true;
|
|
|
|
break;
|
|
|
|
case Opt_all:
|
|
|
|
ctx->all_ss = true;
|
|
|
|
break;
|
|
|
|
case Opt_noprefix:
|
|
|
|
ctx->flags |= CGRP_ROOT_NOPREFIX;
|
|
|
|
break;
|
|
|
|
case Opt_clone_children:
|
|
|
|
ctx->cpuset_clone_children = true;
|
|
|
|
break;
|
|
|
|
case Opt_cpuset_v2_mode:
|
|
|
|
ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
|
|
|
|
break;
|
|
|
|
case Opt_xattr:
|
|
|
|
ctx->flags |= CGRP_ROOT_XATTR;
|
|
|
|
break;
|
2022-07-23 17:28:28 +03:00
|
|
|
case Opt_favordynmods:
|
|
|
|
ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
|
|
|
|
break;
|
|
|
|
case Opt_nofavordynmods:
|
|
|
|
ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
|
|
|
|
break;
|
2019-01-17 08:15:11 +03:00
|
|
|
case Opt_release_agent:
|
|
|
|
/* Specifying two release agents is forbidden */
|
|
|
|
if (ctx->release_agent)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "release_agent respecified");
|
2022-01-20 20:04:01 +03:00
|
|
|
/*
|
|
|
|
* Release agent gets called with all capabilities,
|
|
|
|
* require capabilities to set release agent.
|
|
|
|
*/
|
|
|
|
if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
|
|
|
|
return invalfc(fc, "Setting release_agent not allowed");
|
2019-01-17 08:15:11 +03:00
|
|
|
ctx->release_agent = param->string;
|
|
|
|
param->string = NULL;
|
|
|
|
break;
|
|
|
|
case Opt_name:
|
|
|
|
/* blocked by boot param? */
|
|
|
|
if (cgroup_no_v1_named)
|
2016-12-27 22:49:08 +03:00
|
|
|
return -ENOENT;
|
2019-01-17 08:15:11 +03:00
|
|
|
/* Can't specify an empty name */
|
|
|
|
if (!param->size)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "Empty name");
|
2019-01-17 08:15:11 +03:00
|
|
|
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "Name too long");
|
2019-01-17 08:15:11 +03:00
|
|
|
/* Must match [\w.-]+ */
|
|
|
|
for (i = 0; i < param->size; i++) {
|
|
|
|
char c = param->string[i];
|
|
|
|
if (isalnum(c))
|
|
|
|
continue;
|
|
|
|
if ((c == '.') || (c == '-') || (c == '_'))
|
|
|
|
continue;
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "Invalid name");
|
2019-01-17 08:15:11 +03:00
|
|
|
}
|
|
|
|
/* Specifying two names is forbidden */
|
|
|
|
if (ctx->name)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "name respecified");
|
2019-01-17 08:15:11 +03:00
|
|
|
ctx->name = param->string;
|
|
|
|
param->string = NULL;
|
|
|
|
break;
|
2016-12-27 22:49:08 +03:00
|
|
|
}
|
2019-01-17 07:42:38 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-17 08:15:11 +03:00
|
|
|
static int check_cgroupfs_options(struct fs_context *fc)
|
2019-01-17 07:42:38 +03:00
|
|
|
{
|
2019-01-17 08:15:11 +03:00
|
|
|
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
|
2019-01-17 07:42:38 +03:00
|
|
|
u16 mask = U16_MAX;
|
|
|
|
u16 enabled = 0;
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
#ifdef CONFIG_CPUSETS
|
|
|
|
mask = ~((u16)1 << cpuset_cgrp_id);
|
|
|
|
#endif
|
|
|
|
for_each_subsys(ss, i)
|
|
|
|
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
|
|
|
|
enabled |= 1 << i;
|
|
|
|
|
|
|
|
ctx->subsys_mask &= enabled;
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/*
|
2021-05-24 11:29:43 +03:00
|
|
|
* In absence of 'none', 'name=' and subsystem name options,
|
2019-01-17 07:42:38 +03:00
|
|
|
* let's default to 'all'.
|
2016-12-27 22:49:08 +03:00
|
|
|
*/
|
2019-01-17 07:42:38 +03:00
|
|
|
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
|
|
|
|
ctx->all_ss = true;
|
|
|
|
|
|
|
|
if (ctx->all_ss) {
|
|
|
|
/* Mutually exclusive option 'all' + subsystem name */
|
|
|
|
if (ctx->subsys_mask)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "subsys name conflicts with all");
|
2019-01-17 07:42:38 +03:00
|
|
|
/* 'all' => select all the subsystems */
|
|
|
|
ctx->subsys_mask = enabled;
|
|
|
|
}
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We either have to specify by name or by subsystems. (So all
|
|
|
|
* empty hierarchies must have a name).
|
|
|
|
*/
|
2019-01-17 07:42:38 +03:00
|
|
|
if (!ctx->subsys_mask && !ctx->name)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "Need name or subsystem set");
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Option noprefix was introduced just for backward compatibility
|
|
|
|
* with the old cpuset, so we allow noprefix only if mounting just
|
|
|
|
* the cpuset subsystem.
|
|
|
|
*/
|
2019-01-17 07:42:38 +03:00
|
|
|
if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "noprefix used incorrectly");
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/* Can't specify "none" and some subsystems */
|
2019-01-17 07:42:38 +03:00
|
|
|
if (ctx->subsys_mask && ctx->none)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "none used incorrectly");
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-05 08:38:03 +03:00
|
|
|
int cgroup1_reconfigure(struct fs_context *fc)
|
2016-12-27 22:49:08 +03:00
|
|
|
{
|
2019-01-05 08:38:03 +03:00
|
|
|
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
|
|
|
|
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
|
2016-12-27 22:49:08 +03:00
|
|
|
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
|
2019-01-05 08:38:03 +03:00
|
|
|
int ret = 0;
|
2016-12-27 22:49:08 +03:00
|
|
|
u16 added_mask, removed_mask;
|
|
|
|
|
|
|
|
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
|
|
|
|
|
|
|
|
/* See what subsystems are wanted */
|
2019-01-17 08:15:11 +03:00
|
|
|
ret = check_cgroupfs_options(fc);
|
2016-12-27 22:49:08 +03:00
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2019-01-17 07:42:38 +03:00
|
|
|
if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
|
2016-12-27 22:49:08 +03:00
|
|
|
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
|
|
|
|
task_tgid_nr(current), current->comm);
|
|
|
|
|
2019-01-17 07:42:38 +03:00
|
|
|
added_mask = ctx->subsys_mask & ~root->subsys_mask;
|
|
|
|
removed_mask = root->subsys_mask & ~ctx->subsys_mask;
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/* Don't allow flags or name to change at remount */
|
2019-01-17 07:42:38 +03:00
|
|
|
if ((ctx->flags ^ root->flags) ||
|
|
|
|
(ctx->name && strcmp(ctx->name, root->name))) {
|
2019-12-22 05:35:27 +03:00
|
|
|
errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
|
2019-01-17 07:42:38 +03:00
|
|
|
ctx->flags, ctx->name ?: "", root->flags, root->name);
|
2016-12-27 22:49:08 +03:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* remounting is not allowed for populated hierarchies */
|
|
|
|
if (!list_empty(&root->cgrp.self.children)) {
|
|
|
|
ret = -EBUSY;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = rebind_subsystems(root, added_mask);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
|
|
|
|
|
2019-01-17 07:42:38 +03:00
|
|
|
if (ctx->release_agent) {
|
2016-12-27 22:49:08 +03:00
|
|
|
spin_lock(&release_agent_path_lock);
|
2019-01-17 07:42:38 +03:00
|
|
|
strcpy(root->release_agent_path, ctx->release_agent);
|
2016-12-27 22:49:08 +03:00
|
|
|
spin_unlock(&release_agent_path_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
trace_cgroup_remount(root);
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&cgroup_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
|
|
|
|
.rename = cgroup1_rename,
|
|
|
|
.show_options = cgroup1_show_options,
|
|
|
|
.mkdir = cgroup_mkdir,
|
|
|
|
.rmdir = cgroup_rmdir,
|
|
|
|
.show_path = cgroup_show_path,
|
|
|
|
};
|
|
|
|
|
2019-01-17 17:42:30 +03:00
|
|
|
/*
|
|
|
|
* The guts of cgroup1 mount - find or create cgroup_root to use.
|
|
|
|
* Called with cgroup_mutex held; returns 0 on success, -E... on
|
|
|
|
* error and positive - in case when the candidate is busy dying.
|
|
|
|
* On success it stashes a reference to cgroup_root into given
|
|
|
|
* cgroup_fs_context; that reference is *NOT* counting towards the
|
|
|
|
* cgroup_root refcount.
|
|
|
|
*/
|
|
|
|
static int cgroup1_root_to_use(struct fs_context *fc)
|
2016-12-27 22:49:08 +03:00
|
|
|
{
|
2019-01-17 05:23:02 +03:00
|
|
|
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
|
2016-12-27 22:49:08 +03:00
|
|
|
struct cgroup_root *root;
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
int i, ret;
|
|
|
|
|
|
|
|
/* First find the desired set of subsystems */
|
2019-01-17 08:15:11 +03:00
|
|
|
ret = check_cgroupfs_options(fc);
|
2016-12-27 22:49:08 +03:00
|
|
|
if (ret)
|
2019-01-17 17:42:30 +03:00
|
|
|
return ret;
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Destruction of cgroup root is asynchronous, so subsystems may
|
|
|
|
* still be dying after the previous unmount. Let's drain the
|
|
|
|
* dying subsystems. We just need to ensure that the ones
|
|
|
|
* unmounted previously finish dying and don't care about new ones
|
|
|
|
* starting. Testing ref liveliness is good enough.
|
|
|
|
*/
|
|
|
|
for_each_subsys(ss, i) {
|
2019-01-17 07:42:38 +03:00
|
|
|
if (!(ctx->subsys_mask & (1 << i)) ||
|
2016-12-27 22:49:08 +03:00
|
|
|
ss->root == &cgrp_dfl_root)
|
|
|
|
continue;
|
|
|
|
|
2019-01-17 17:42:30 +03:00
|
|
|
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
|
|
|
|
return 1; /* restart */
|
2016-12-27 22:49:08 +03:00
|
|
|
cgroup_put(&ss->root->cgrp);
|
|
|
|
}
|
|
|
|
|
|
|
|
for_each_root(root) {
|
|
|
|
bool name_match = false;
|
|
|
|
|
|
|
|
if (root == &cgrp_dfl_root)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we asked for a name then it must match. Also, if
|
|
|
|
* name matches but sybsys_mask doesn't, we should fail.
|
|
|
|
* Remember whether name matched.
|
|
|
|
*/
|
2019-01-17 07:42:38 +03:00
|
|
|
if (ctx->name) {
|
|
|
|
if (strcmp(ctx->name, root->name))
|
2016-12-27 22:49:08 +03:00
|
|
|
continue;
|
|
|
|
name_match = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we asked for subsystems (or explicitly for no
|
|
|
|
* subsystems) then they must match.
|
|
|
|
*/
|
2019-01-17 07:42:38 +03:00
|
|
|
if ((ctx->subsys_mask || ctx->none) &&
|
|
|
|
(ctx->subsys_mask != root->subsys_mask)) {
|
2016-12-27 22:49:08 +03:00
|
|
|
if (!name_match)
|
|
|
|
continue;
|
2019-01-17 17:42:30 +03:00
|
|
|
return -EBUSY;
|
2016-12-27 22:49:08 +03:00
|
|
|
}
|
|
|
|
|
2019-01-17 07:42:38 +03:00
|
|
|
if (root->flags ^ ctx->flags)
|
2016-12-27 22:49:08 +03:00
|
|
|
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
|
|
|
|
|
2019-01-17 10:25:51 +03:00
|
|
|
ctx->root = root;
|
2019-01-17 17:42:30 +03:00
|
|
|
return 0;
|
2016-12-27 22:49:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* No such thing, create a new one. name= matching without subsys
|
|
|
|
* specification is allowed for already existing hierarchies but we
|
|
|
|
* can't create new one without subsys specification.
|
|
|
|
*/
|
2019-01-17 17:42:30 +03:00
|
|
|
if (!ctx->subsys_mask && !ctx->none)
|
2019-12-22 05:35:27 +03:00
|
|
|
return invalfc(fc, "No subsys list or none specified");
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
/* Hierarchies may only be created in the initial cgroup namespace. */
|
2019-01-17 18:14:26 +03:00
|
|
|
if (ctx->ns != &init_cgroup_ns)
|
2019-01-17 17:42:30 +03:00
|
|
|
return -EPERM;
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
root = kzalloc(sizeof(*root), GFP_KERNEL);
|
2019-01-17 17:42:30 +03:00
|
|
|
if (!root)
|
|
|
|
return -ENOMEM;
|
2016-12-27 22:49:08 +03:00
|
|
|
|
2019-01-17 10:25:51 +03:00
|
|
|
ctx->root = root;
|
|
|
|
init_cgroup_root(ctx);
|
2016-12-27 22:49:08 +03:00
|
|
|
|
2019-01-17 07:42:38 +03:00
|
|
|
ret = cgroup_setup_root(root, ctx->subsys_mask);
|
2022-07-23 17:28:28 +03:00
|
|
|
if (!ret)
|
|
|
|
cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
|
|
|
|
else
|
2016-12-27 22:49:08 +03:00
|
|
|
cgroup_free_root(root);
|
2022-07-23 17:28:28 +03:00
|
|
|
|
2019-01-17 17:42:30 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int cgroup1_get_tree(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Check if the caller has permission to mount. */
|
2019-01-17 18:14:26 +03:00
|
|
|
if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
|
2019-01-17 17:42:30 +03:00
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
|
|
|
|
|
|
|
|
ret = cgroup1_root_to_use(fc);
|
|
|
|
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
|
|
|
|
ret = 1; /* restart */
|
2016-12-27 22:49:08 +03:00
|
|
|
|
|
|
|
mutex_unlock(&cgroup_mutex);
|
|
|
|
|
2019-01-17 17:42:30 +03:00
|
|
|
if (!ret)
|
2019-01-17 18:14:26 +03:00
|
|
|
ret = cgroup_do_get_tree(fc);
|
2019-01-17 17:42:30 +03:00
|
|
|
|
|
|
|
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
|
cgroup1: fix leaked context root causing sporadic NULL deref in LTP
Richard reported sporadic (roughly one in 10 or so) null dereferences and
other strange behaviour for a set of automated LTP tests. Things like:
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014
RIP: 0010:kernfs_sop_show_path+0x1b/0x60
...or these others:
RIP: 0010:do_mkdirat+0x6a/0xf0
RIP: 0010:d_alloc_parallel+0x98/0x510
RIP: 0010:do_readlinkat+0x86/0x120
There were other less common instances of some kind of a general scribble
but the common theme was mount and cgroup and a dubious dentry triggering
the NULL dereference. I was only able to reproduce it under qemu by
replicating Richard's setup as closely as possible - I never did get it
to happen on bare metal, even while keeping everything else the same.
In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
we see this as a part of the overall change:
--------------
struct cgroup_subsys *ss;
- struct dentry *dentry;
[...]
- dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root,
- CGROUP_SUPER_MAGIC, ns);
[...]
- if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
- struct super_block *sb = dentry->d_sb;
- dput(dentry);
+ ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns);
+ if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
deactivate_locked_super(sb);
msleep(10);
return restart_syscall();
}
--------------
In changing from the local "*dentry" variable to using fc->root, we now
export/leave that dentry pointer in the file context after doing the dput()
in the unlikely "is_dying" case. With LTP doing a crazy amount of back to
back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely
becomes slightly likely and then bad things happen.
A fix would be to not leave the stale reference in fc->root as follows:
--------------
dput(fc->root);
+ fc->root = NULL;
deactivate_locked_super(sb);
--------------
...but then we are just open-coding a duplicate of fc_drop_locked() so we
simply use that instead.
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@vger.kernel.org # v5.1+
Reported-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2021-06-16 15:51:57 +03:00
|
|
|
fc_drop_locked(fc);
|
2019-01-17 17:42:30 +03:00
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(ret > 0)) {
|
2019-01-12 08:20:54 +03:00
|
|
|
msleep(10);
|
2019-01-17 05:23:02 +03:00
|
|
|
return restart_syscall();
|
2017-04-19 05:15:59 +03:00
|
|
|
}
|
2019-01-17 10:44:07 +03:00
|
|
|
return ret;
|
2016-12-27 22:49:08 +03:00
|
|
|
}
|
|
|
|
|
2016-12-27 22:49:06 +03:00
|
|
|
/*
 * Allocate the workqueue used for pidlist destruction.  Registered as a
 * core_initcall; allocation failure triggers BUG_ON().  Returns 0.
 */
static int __init cgroup1_wq_init(void)
{
	struct workqueue_struct *wq;

	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1);
	cgroup_pidlist_destroy_wq = wq;
	BUG_ON(!cgroup_pidlist_destroy_wq);

	return 0;
}
core_initcall(cgroup1_wq_init);
|
|
|
|
|
|
|
|
static int __init cgroup_no_v1(char *str)
|
|
|
|
{
|
|
|
|
struct cgroup_subsys *ss;
|
|
|
|
char *token;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
while ((token = strsep(&str, ",")) != NULL) {
|
|
|
|
if (!*token)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!strcmp(token, "all")) {
|
|
|
|
cgroup_no_v1_mask = U16_MAX;
|
2018-12-28 21:31:07 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(token, "named")) {
|
|
|
|
cgroup_no_v1_named = true;
|
|
|
|
continue;
|
2016-12-27 22:49:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
for_each_subsys(ss, i) {
|
|
|
|
if (strcmp(token, ss->name) &&
|
|
|
|
strcmp(token, ss->legacy_name))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
cgroup_no_v1_mask |= 1 << i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("cgroup_no_v1=", cgroup_no_v1);
|