diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 290792a13e3c..e90c7c164c83 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; @@ -614,15 +614,13 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - u64 sum; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_stat_read((void *)pd + off); - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_stat *stat = (void *)pos_pd + off; @@ -649,16 +647,14 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - struct blkg_rwstat sum; + struct cgroup_subsys_state *pos_css; + struct blkg_rwstat sum = { }; int i; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_rwstat_read((void *)pd + off); - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_rwstat *rwstat = (void *)pos_pd + off; struct blkg_rwstat tmp; @@ -765,18 +761,18 @@ struct cftype blkcg_files[] = { /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -798,21 +794,21 @@ static void blkcg_css_offline(struct cgroup *cgroup) spin_unlock_irq(&blkcg->lock); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; - if (!parent) { + if (!parent_css) { blkcg = &blkcg_root; goto done; } @@ -883,14 +879,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -1127,7 +1124,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* kill the intf files first */ if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ blkcg_policy[pol->plid] = NULL; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8056c03a3382..ae6969a7ffd4 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -179,22 +179,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkcg, css); + return css ? container_of(css, struct blkcg, css) : NULL; } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + return css_to_blkcg(task_css(tsk, blkio_subsys_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) { if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); + return css_to_blkcg(bio->bi_css); return task_blkcg(current); } @@ -206,9 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - struct cgroup *pcg = blkcg->css.cgroup->parent; - - return pcg ? cgroup_to_blkcg(pcg) : NULL; + return css_to_blkcg(css_parent(&blkcg->css)); } /** @@ -288,32 +284,33 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. */ -#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** @@ -576,7 +573,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 08a32dfd3844..8331aba9426f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, cft->private, true); @@ -1325,31 +1325,31 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_uint(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, + const char *buf, bool is_u64) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); @@ -1379,8 +1379,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - tg_update_has_rules(tg); - blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1403,16 +1402,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, return 0; } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, true); + return tg_set_conf(css, cft, buf, true); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, false); + return tg_set_conf(css, cft, buf, false); } static struct cftype throtl_files[] = { @@ -1623,7 +1622,7 @@ void blk_throtl_drain(struct request_queue *q) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct bio *bio; int rw; @@ -1636,11 +1635,9 @@ void blk_throtl_drain(struct request_queue *q) * better to walk service_queue tree directly but blkg walk is * easier. */ - blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - tg_drain_bios(&td_root_tg(td)->service_queue); - /* finally, transfer bios from top-level tg's into the td */ tg_drain_bios(&td->service_queue); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5bbdcfd0dab..dabb9d02cf9a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } @@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, +static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, +static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", - cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); return 0; } -static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf, bool is_leaf_weight) +static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf, + bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, false); + return __cfqg_set_weight_device(css, cft, buf, false); } -static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, true); + return __cfqg_set_weight_device(css, cft, buf, true); } -static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, - bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) @@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, return 0; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return __cfq_set_weight(cgrp, cft, val, false); + return __cfq_set_weight(css, cft, val, false); } -static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - return __cfq_set_weight(cgrp, cft, val, true); + return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, +static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, cft->private, true); @@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, cft->private, true); @@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 0, false); diff --git a/fs/bio.c b/fs/bio.c index c5eae7251490..b3b20ed9510e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1956,7 +1956,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e9ac882868c0..3561d305b1e0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -66,22 +66,25 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* - * The cgroup that this subsystem is attached to. Useful - * for subsystems that want to know about the cgroup - * hierarchy structure - */ + /* the cgroup that this css is attached to */ struct cgroup *cgroup; + /* the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* the parent css */ + struct cgroup_subsys_state *parent; + unsigned long flags; /* ID for this css, if possible */ struct css_id __rcu *id; - /* Used to put @cgroup->dentry on the last css_put() */ - struct work_struct dput_work; + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; + struct work_struct destroy_work; }; /* bits in struct cgroup_subsys_state flags field */ @@ -161,7 +164,16 @@ struct cgroup_name { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - int id; /* ida allocated in-hierarchy ID */ + /* + * idr allocated in-hierarchy ID. + * + * The ID of the root cgroup is always 0, and a new cgroup + * will be assigned with a smallest available ID. + */ + int id; + + /* the number of attached css's */ + int nr_css; /* * We link our 'sibling' struct into our parent's 'children'. @@ -196,7 +208,7 @@ struct cgroup { struct cgroup_name __rcu *name; /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroupfs_root *root; @@ -220,10 +232,12 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; + /* dummy css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state dummy_css; + /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - atomic_t css_kill_cnt; /* List of events which userspace want to receive */ struct list_head event_list; @@ -322,7 +336,7 @@ struct cgroupfs_root { unsigned long flags; /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; + struct idr cgroup_idr; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; @@ -394,9 +408,10 @@ struct cgroup_map_cb { /* cftype->flags */ enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cg */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cg */ + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ }; #define MAX_CFTYPE_NAME 64 @@ -424,35 +439,41 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; + /* + * The subsys this file belongs to. Initialized automatically + * during registration. NULL for cgroup core files. + */ + struct cgroup_subsys *ss; + int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ - u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ - s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_map() is used for defining a map of key/value * pairs. It should call cb->fill(cb, key, value) for each * entry. The key/value pairs (and their ordering) should not * change between reboots. */ - int (*read_map)(struct cgroup *cgrp, struct cftype *cft, + int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb); /* * read_seq_string() is used for outputting a simple sequence * using seqfile. */ - int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m); + int (*read_seq_string)(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); @@ -461,18 +482,20 @@ struct cftype { * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ - int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); /* * write_s64() is a signed version of write_u64() */ - int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); /* * write_string() is passed a nul-terminated kernelspace * buffer of maximum length determined by max_write_len. * Returns 0 or -ve error code. */ - int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer); /* * trigger() callback can be used to get some kick from the @@ -480,7 +503,7 @@ struct cftype { * at all. The private field can be used to determine the * kick type for multiplexing. */ - int (*trigger)(struct cgroup *cgrp, unsigned int event); + int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); int (*release)(struct inode *inode, struct file *file); @@ -490,16 +513,18 @@ struct cftype { * you want to provide this functionality. Use eventfd_signal() * on eventfd to send notification to userspace. */ - int (*register_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args); + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); /* * unregister_event() callback will be called when userspace * closes the eventfd or on cgroup removing. * This callback must be implemented, if you want provide * notification functionality. */ - void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd); + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); }; /* @@ -512,15 +537,6 @@ struct cftype_set { struct cftype *cfts; }; -struct cgroup_scanner { - struct cgroup *cg; - int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); - void (*process_task)(struct task_struct *p, - struct cgroup_scanner *scan); - struct ptr_heap *heap; - void *data; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -537,7 +553,7 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) } int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); @@ -553,20 +569,22 @@ int cgroup_task_count(const struct cgroup *cgrp); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id); int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all + * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_cgrp, tset) \ +#define cgroup_taskset_for_each(task, skip_css, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) \ - if (!(skip_cgrp) || \ - cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) + if (!(skip_css) || \ + cgroup_taskset_cur_css((tset), \ + (skip_css)->ss->subsys_id) != (skip_css)) /* * Control Group subsystem type. @@ -574,18 +592,22 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp); - int (*css_online)(struct cgroup *cgrp); - void (*css_offline)(struct cgroup *cgrp); - void (*css_free)(struct cgroup *cgrp); + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); - int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task); - void (*bind)(struct cgroup *root); + void (*bind)(struct cgroup_subsys_state *root_css); int subsys_id; int disabled; @@ -641,10 +663,17 @@ struct cgroup_subsys { #undef IS_SUBSYS_ENABLED #undef SUBSYS -static inline struct cgroup_subsys_state *cgroup_subsys_state( - struct cgroup *cgrp, int subsys_id) +/** + * css_parent - find the parent css + * @css: the target cgroup_subsys_state + * + * Return the parent css of @css. This function is guaranteed to return + * non-NULL parent as long as @css isn't the root. + */ +static inline +struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) { - return cgrp->subsys[subsys_id]; + return css->parent; } /** @@ -672,7 +701,7 @@ extern struct mutex cgroup_mutex; #endif /** - * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds + * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() @@ -680,7 +709,7 @@ extern struct mutex cgroup_mutex; * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ -#define task_subsys_state_check(task, subsys_id, __c) \ +#define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** @@ -695,87 +724,92 @@ static inline struct css_set *task_css_set(struct task_struct *task) } /** - * task_subsys_state - obtain css for (task, subsys) + * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * - * See task_subsys_state_check(). + * See task_css_check(). */ -static inline struct cgroup_subsys_state * -task_subsys_state(struct task_struct *task, int subsys_id) +static inline struct cgroup_subsys_state *task_css(struct task_struct *task, + int subsys_id) { - return task_subsys_state_check(task, subsys_id, false); + return task_css_check(task, subsys_id, false); } -static inline struct cgroup* task_cgroup(struct task_struct *task, - int subsys_id) +static inline struct cgroup *task_cgroup(struct task_struct *task, + int subsys_id) { - return task_subsys_state(task, subsys_id)->cgroup; + return task_css(task, subsys_id)->cgroup; } -struct cgroup *cgroup_next_sibling(struct cgroup *pos); +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); + +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); /** - * cgroup_for_each_child - iterate through children of a cgroup - * @pos: the cgroup * to use as the loop cursor - * @cgrp: cgroup whose children to walk + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk * - * Walk @cgrp's children. Must be called under rcu_read_lock(). A child - * cgroup which hasn't finished ->css_online() or already has finished + * Walk @parent's children. Must be called under rcu_read_lock(). A child + * css which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a cgroup which finished ->css_online() is + * before starting iterating, a css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgrp) \ - for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ - struct cgroup, sibling); \ - (pos); (pos) = cgroup_next_sibling((pos))) +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup); -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos); /** - * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk * - * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A - * descendant cgroup which hasn't finished ->css_online() or already has + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). A + * descendant css which hasn't finished ->css_online() or already has * finished ->css_offline() may show up during traversal and it's each * subsystem's responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant cgroup which finished ->css_online() is + * iteration, any descendant css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * In other words, the following guarantees that a descendant can't escape * state updates of its ancestors. * - * my_online(@cgrp) + * my_online(@css) * { - * Lock @cgrp->parent and @cgrp; - * Inherit state from @cgrp->parent; + * Lock @css's parent and @css; + * Inherit state from the parent; * Unlock both. * } * - * my_update_state(@cgrp) + * my_update_state(@css) * { - * Lock @cgrp; - * Update @cgrp's state; - * Unlock @cgrp; - * - * cgroup_for_each_descendant_pre(@pos, @cgrp) { + * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; - * Verify @pos is alive and inherit state from @pos->parent; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } @@ -786,8 +820,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any cgroup after the latest update to its - * parent. + * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. @@ -800,52 +833,45 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_descendant_pre(pos, cgroup) \ - for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_pre((pos), (cgroup))) +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup); +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); /** - * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk * - * Similar to cgroup_for_each_descendant_pre() but performs post-order - * traversal instead. Note that the walk visibility guarantee described in - * pre-order walk doesn't apply the same to post-order walks. + * Similar to css_for_each_descendant_pre() but performs post-order + * traversal instead. @root is included in the iteration and the last + * node to be visited. Note that the walk visibility guarantee described + * in pre-order walk doesn't apply the same to post-order walks. */ -#define cgroup_for_each_descendant_post(pos, cgroup) \ - for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_post((pos), (cgroup))) +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) -/* A cgroup_iter should be treated as an opaque object */ -struct cgroup_iter { - struct list_head *cset_link; - struct list_head *task; +/* A css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys_state *origin_css; + struct list_head *cset_link; + struct list_head *task; }; -/* - * To iterate across the tasks in a cgroup: - * - * 1) call cgroup_iter_start to initialize an iterator - * - * 2) call cgroup_iter_next() to retrieve member tasks until it - * returns NULL or until you want to end the iteration - * - * 3) call cgroup_iter_end() to destroy the iterator. - * - * Or, call cgroup_scan_tasks() to iterate through every task in a - * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling - * the test_task() callback, but not while calling the process_task() - * callback. - */ -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it); -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); -int cgroup_scan_tasks(struct cgroup_scanner *scan); +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); + +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -878,7 +904,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7b4d9d79570b..6c416092e324 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,7 +85,7 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); -extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); +extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); static inline bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 7dc17e2456de..3f3788d49362 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -34,10 +34,12 @@ extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, +extern int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 0fee0617fb7d..52adaa75dac9 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -35,7 +35,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_subsys_state(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_subsys_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); @@ -51,7 +51,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - css = task_subsys_state(p, net_cls_subsys_id); + css = task_css(p, net_cls_subsys_id); if (css) classid = container_of(css, struct cgroup_cls_state, css)->classid; diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 50ab8c26ab59..a24f8bb3ca47 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,10 +25,6 @@ struct netprio_map { u32 priomap[]; }; -struct cgroup_netprio_state { - struct cgroup_subsys_state css; -}; - extern void sock_update_netprioidx(struct sock *sk); #if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) @@ -39,7 +35,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; @@ -53,7 +49,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx = 0; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); if (css) idx = css->cgroup->id; rcu_read_unlock(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e91963302c0d..e0aeb32415ff 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -81,7 +81,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif @@ -117,6 +117,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -159,9 +160,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. + * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static void cgroup_offline_fn(struct work_struct *work); +static struct cftype cgroup_base_files[]; + +static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); + +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; +} /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) @@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * css_task_iter_start(). + */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) @@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * - * Returns true if "cg" matches "old_cg" except for the hierarchy + * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, @@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_root_subsys(cgrp->root, ss) - ss->css_free(cgrp); - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - /* * Drop the active superblock reference that we took when we * created the cgroup. This will free cgrp->root, if we are @@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_directory - selective removal of base and subsystem files - * @dir: directory containing the files - * @base_files: true if the base files should be removed + * cgroup_clear_dir - remove subsys files in a cgroup directory + * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_directory(struct dentry *dir, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; + int i; - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); - } - if (base_files) { - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - - cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - int i; + unsigned long pinned = 0; + int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (!(bit & added_mask)) + if (!(added_mask & (1 << i))) continue; + /* is the subsystem mounted elsewhere? */ if (ss->root != &cgroup_dummy_root) { - /* Subsystem isn't free */ - return -EBUSY; + ret = -EBUSY; + goto out_put; } + + /* pin the module */ + if (!try_module_get(ss->module)) { + ret = -ENOENT; + goto out_put; + } + pinned |= 1 << i; } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; + /* subsys could be missing if unloaded between parsing and here */ + if (added_mask != pinned) { + ret = -ENOENT; + goto out_put; + } + + ret = cgroup_populate_dir(cgrp, added_mask); + if (ret) + goto out_put; + + /* + * Nothing can fail from this point on. Remove files for the + * removed subsystems and rebind each subsystem. + */ + cgroup_clear_dir(cgrp, removed_mask); - /* Process each subsystem */ for_each_subsys(ss, i) { unsigned long bit = 1UL << i; if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!cgroup_dummy_top->subsys[i]); - BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); + + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; - cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgroup_css(cgrp, ss)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top); - cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; - cgrp->subsys[i] = NULL; + ss->bind(cgroup_css(cgroup_dummy_top, ss)); + + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); root->subsys_mask &= ~bit; - } else if (bit & root->subsys_mask) { - /* Subsystem state should already exist */ - BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. - */ - module_put(ss->module); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); } } @@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; + +out_put: + for_each_subsys(ss, i) + if (pinned & (1 << i)) + module_put(ss->module); + return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; - bool module_pin_failed = false; struct cgroup_subsys *ss; int i; @@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!opts->subsys_mask && !opts->name) return -EINVAL; - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for_each_subsys(ss, i) { - if (!(opts->subsys_mask & (1UL << i))) - continue; - if (!try_module_get(cgroup_subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. - */ - for (i--; i >= 0; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - module_put(cgroup_subsys[i]->module); - } - return -ENOENT; - } - return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i; - - mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) - if (subsys_mask & (1UL << i)) - module_put(cgroup_subsys[i]->module); - mutex_unlock(&cgroup_mutex); -} - static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* - * Clear out the files of subsystems that should be removed, do - * this before rebind_subsystems, since rebind_subsystems may - * change this hierarchy's subsys_list. - */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); - - ret = rebind_subsystems(root, added_mask, removed_mask); - if (ret) { - /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, false, removed_mask); + /* remounting is not allowed for populated hierarchies */ + if (root->number_of_cgroups > 1) { + ret = -EBUSY; goto out_unlock; } - /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_mask); + ret = rebind_subsystems(root, added_mask, removed_mask); + if (ret) + goto out_unlock; if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - if (ret) - drop_parsed_module_refcounts(opts.subsys_mask); return ret; } @@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); } static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) @@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; - ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) /* hierarhcy ID shoulid already have been released */ WARN_ON_ONCE(root->hierarchy_id); - ida_destroy(&root->cgroup_ida); + idr_destroy(&root->cgroup_idr); kfree(root); } } @@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct list_head tmp_links; struct inode *inode; + const struct cred *cred; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto drop_modules; + goto out_err; } opts.new_root = new_root; @@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto drop_modules; + goto out_err; } root = sb->s_fs_info; BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; - const struct cred *cred; int i; struct css_set *cset; @@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, + 0, 1, GFP_KERNEL); + if (root_cgrp->id < 0) + goto unlock_drop; + /* Check for name clashes with existing mounts */ ret = -EBUSY; if (strlen(root->name)) @@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to + * create the root files, which doesn't work if SELinux is + * in use. The following cred dancing somehow works around + * it. See 2ce9738ba ("cgroupfs: use init_cred when + * populating new cgroupfs mount") for more details. + */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto rm_base_files; + ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret == -EBUSY) { - free_cgrp_cset_links(&tmp_links); - goto unlock_drop; - } + if (ret) + goto rm_base_files; + + revert_creds(cred); + /* * There must be no failure case after here, since rebinding * takes care of subsystems' refcounts, which are explicitly * dropped in the failure exit path. */ - /* EBUSY should be the only error here */ - BUG_ON(ret); - list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; - /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); @@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_mask); - revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); kfree(opts.name); return dget(sb->s_root); + rm_base_files: + free_cgrp_cset_links(&tmp_links); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); + revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); @@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); @@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; - struct css_set *cg; + struct css_set *cset; }; struct cgroup_taskset { @@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). */ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset @@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tc = flex_array_get(group, i); old_cset = task_css_set(tc->task); - tc->cg = find_css_set(old_cset, cgrp); - if (!tc->cg) { + tc->cset = find_css_set(old_cset, cgrp); + if (!tc->cset) { retval = -ENOMEM; goto out_put_css_set_refs; } @@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } /* nothing is sensitive to fork() after this point. */ @@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2141,18 +2129,20 @@ out_put_css_set_refs: if (retval) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - if (!tc->cg) + if (!tc->cset) break; - put_css_set(tc->cg); + put_css_set(tc->cset); } } out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk, false); + retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } @@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); @@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2380,65 +2374,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cfe->css; - if (cgroup_is_dead(cgrp)) - return -ENODEV; if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (cgroup_is_dead(cgrp)) - return -ENODEV; + struct cgroup_subsys_state *css = cfe->css; if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. */ -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; @@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; + struct cfent *cfe = m->private; + struct cftype *cft = cfe->type; + struct cgroup_subsys_state *css = cfe->css; + if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(state->cgroup, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_seqfile_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); + + /* + * If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; + rcu_read_unlock(); + + if (!css) + return -ENODEV; + + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state; - - state = kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) + err = single_open(file, cgroup_seqfile_show, cfe); + } else if (cft->open) { err = cft->open(inode, file); - else - err = 0; + } + if (css->ss && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css->ss) + css_put(css); + return ret; } /* @@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2782,11 +2783,25 @@ out: return error; } -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @cgrp: the target cgroup + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. + */ +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; - int err, ret = 0; + int ret; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - err = cgroup_add_file(cgrp, subsys, cft); - if (err) + ret = cgroup_add_file(cgrp, cft); + if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, err); - ret = err; + cft->name, ret); + return ret; + } } else { cgroup_rm_file(cgrp, cft); } } - return ret; + return 0; } static void cgroup_cfts_prepare(void) @@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void) /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we use cgroup_for_each_descendant_pre() and drop RCU - * read lock before calling cgroup_addrm_files(). + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). */ mutex_lock(&cgroup_mutex); } -static void cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); - struct cgroup *cgrp, *root = &ss->root->top_cgroup; + struct cgroup_subsys *ss = cfts[0].ss; + struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; + struct cgroup_subsys_state *css; u64 update_before; + int ret = 0; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); - return; + return 0; } /* @@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_unlock(&cgroup_mutex); - /* @root always needs to be updated */ - inode = root->dentry->d_inode; - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - cgroup_addrm_files(root, ss, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - /* add/rm files for all cgroups created before */ rcu_read_lock(); - cgroup_for_each_descendant_pre(cgrp, root) { + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + if (cgroup_is_dead(cgrp)) continue; @@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); rcu_read_lock(); + if (ret) + break; } rcu_read_unlock(); dput(prev); deactivate_super(sb); + return ret; } /** @@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; + int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - cgroup_cfts_commit(ss, cfts, true); - - return 0; + ret = cgroup_cfts_commit(cfts, true); + if (ret) + cgroup_rm_cftypes(cfts); + return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(ss, cfts, false); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) -{ - struct list_head *l = it->cset_link; - struct cgrp_cset_link *link; - struct css_set *cset; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->cset_links) { - it->cset_link = NULL; - return; - } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; -} - -/* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_sibling - find the next sibling of a given cgroup - * @pos: the current cgroup + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * This function returns the next sibling of @pos and should be called - * under RCU read lock. The only requirement is that @pos is accessible. - * The next sibling is guaranteed to be returned regardless of @pos's - * state. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_sibling(struct cgroup *pos) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) * safe to dereference from this RCU critical section. If * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed * to be visible as %true here. + * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. */ - if (likely(!cgroup_is_dead(pos))) { + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return next; - return NULL; + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; } - /* - * Can't dereference the next pointer. Each cgroup is given a - * monotonically increasing unique serial number and always - * appended to the sibling list, so the next one can be found by - * walking the parent's children until we see a cgroup with higher - * serial number than @pos's. - * - * While this path can be slow, it's taken only when either the - * current cgroup is removed or iteration and removal race. - */ - list_for_each_entry_rcu(next, &pos->parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - return next; - return NULL; + if (&next->sibling == &cgrp->children) + return NULL; + + return cgroup_css(next, parent_css->ss); } -EXPORT_SYMBOL_GPL(cgroup_next_sibling); +EXPORT_SYMBOL_GPL(css_next_child); /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * and @root are accessible and @pos is a descendant of @root. */ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* if first iteration, visit @root */ if (!pos) - pos = cgroup; + return root; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = cgroup_next_sibling(pos); + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); if (next) return next; - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires RCU read locking, it doesn't require the @@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * function will return the correct rightmost descendant as long as @pos is * accessible. */ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. */ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, visit the leftmost descendant */ if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; + next = css_leftmost_descendant(root); + return next != root ? next : NULL; } + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_sibling(pos); + next = css_next_child(pos, css_parent(pos)); if (next) - return cgroup_leftmost_descendant(next); + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + return css_parent(pos); } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); +EXPORT_SYMBOL_GPL(css_next_descendant_post); -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * css_advance_task_iter - advance a task itererator to the next css_set + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. + */ +static void css_advance_task_iter(struct css_task_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &it->origin_css->cgroup->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; +} + +/** + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @css. The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->cset_link = &cgrp->cset_links; - cgroup_advance_iter(cgrp, it); + + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; + + css_advance_task_iter(it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * css_task_iter_next - return the next task for the iterator + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. + */ +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, l = l->next; link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); if (l == &link->cset->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + css_advance_task_iter(it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * css_task_iter_end - finish task iteration + * @it: the task iterator to finish + * + * Finish task iteration started by css_task_iter_start(). + */ +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates css_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @css for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the css, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. This * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cg, &it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cg = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3468,7 +3533,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). @@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); l->owner = cgrp; @@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + css_task_iter_end(&it); err: return ret; @@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. */ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) @@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } @@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); eventfd_ctx_put(event->eventfd); kfree(event); - cgroup_dput(cgrp); + css_put(css); } /* @@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, const char *buffer) { - struct cgroup_event *event = NULL; - struct cgroup *cgrp_cfile; + struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup_event *event; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; + struct file *efile; + struct file *cfile; char *endp; int ret; @@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, efile = eventfd_fget(efd); if (IS_ERR(efile)) { ret = PTR_ERR(efile); - goto fail; + goto out_kfree; } event->eventfd = eventfd_ctx_fileget(efile); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto fail; + goto out_put_efile; } cfile = fget(cfd); if (!cfile) { ret = -EBADF; - goto fail; + goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) - goto fail; + goto out_put_cfile; event->cft = __file_cft(cfile); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); - goto fail; + goto out_put_cfile; + } + + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; } /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto fail; - } + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss); + cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto fail; + goto out_put_css; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto fail; + goto out_put_css; efile->f_op->poll(efile, &event->pt); - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. - */ - dget(cgrp->dentry); - spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); @@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, return 0; -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - +out_put_css: + css_put(event->css); +out_put_cfile: + fput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fput(efile); +out_kfree: kfree(event); return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } @@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = { }; /** - * cgroup_populate_dir - selectively creation of files in a directory + * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be added * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - int err; struct cgroup_subsys *ss; - - if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); - if (err < 0) - return err; - } + int i, ret = 0; /* process cftsets of each subsystem */ - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, ss, set->cfts, true); + list_for_each_entry(set, &ss->cftsets, node) { + ret = cgroup_addrm_files(cgrp, set->cfts, true); + if (ret < 0) + goto err; + } } /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, } return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; } -static void css_dput_fn(struct work_struct *work) +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. + */ +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; - cgroup_dput(css->cgroup); + if (css->parent) + css_put(css->parent); + + css->ss->css_free(css); + cgroup_dput(cgrp); +} + +static void css_free_rcu_fn(struct rcu_head *rcu_head) +{ + struct cgroup_subsys_state *css = + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); + + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context which we don't have. + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - schedule_work(&css->dput_work); + call_rcu(&css->rcu_head, css_free_rcu_fn); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; + css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == cgroup_dummy_top) - css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); - cgrp->subsys[ss->subsys_id] = css; - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). - */ - INIT_WORK(&css->dput_work, css_dput_fn); + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss); + else + css->flags |= CSS_ROOT; + + BUG_ON(cgroup_css(cgrp, ss)); } -/* invoke ->post_create() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +/* invoke ->css_online() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); - if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + ret = ss->css_online(css); + if (!ret) { + css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } -/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* @@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_cgrp; rcu_assign_pointer(cgrp->name, name); - cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); if (cgrp->id < 0) goto err_free_name; @@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; } + css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); - if (err) { - ss->css_free(cgrp); + if (err) goto err_free_all; - } - init_cgroup_css(css, ss, cgrp); + init_css(css, ss, cgrp); if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_root_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + dget(dentry); + css_get(css->parent); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - err = online_css(ss, cgrp); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + + err = online_css(css); if (err) goto err_destroy; @@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (err) + goto err_destroy; + + err = cgroup_populate_dir(cgrp, root->subsys_mask); if (err) goto err_destroy; @@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; if (css) { percpu_ref_cancel_init(&css->refcnt); - ss->css_free(cgrp); + ss->css_free(css); } } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free_id: - ida_simple_remove(&root->cgroup_ida, cgrp->id); + idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: @@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static void cgroup_css_killed(struct cgroup *cgrp) +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. + */ +static void css_killed_work_fn(struct work_struct *work) { - if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) - return; + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; - /* percpu ref's of all css's are killed, kick off the next step */ - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); + mutex_lock(&cgroup_mutex); + + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); } -static void css_ref_killed_fn(struct percpu_ref *ref) +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - cgroup_css_killed(css->cgroup); + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); +} + +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** @@ -4513,41 +4702,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by killing css refcnts. cgroup core - * guarantees that, by the time ->css_offline() is invoked, no new - * css reference will be given out via css_tryget(). We can't - * simply call percpu_ref_kill() and proceed to offlining css's - * because percpu_ref_kill() doesn't guarantee that the ref is seen - * as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. The - * notification callback keeps track of the number of css's to be - * killed and schedules cgroup_offline_fn() to perform the rest of - * destruction once the percpu refs of all css's are confirmed to - * be killed. + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. */ - atomic_set(&cgrp->css_kill_cnt, 1); - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - /* - * Killing would put the base ref, but we need to keep it - * alive until after ->css_offline. - */ - percpu_ref_get(&css->refcnt); - - atomic_inc(&cgrp->css_kill_cnt); - percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); - } - cgroup_css_killed(cgrp); + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss)); /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * CGRP_DEAD assertion is depended upon by css_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_sibling() for details. + * css_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); @@ -4558,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* - * Remove @cgrp directory. The removal puts the base ref but we - * aren't quite done with @cgrp yet, so hold onto it. + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + + /* + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. + */ + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); @@ -4580,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) }; /** - * cgroup_offline_fn - the second step of cgroup destruction + * cgroup_destroy_css_killed - the second step of cgroup destruction * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being - * destroyed after the percpu refcnts of all css's are guaranteed to be - * seen as killed on all CPUs, and performs the rest of destruction. This - * is the second step of destruction described in the comment above - * cgroup_destroy_locked(). + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). */ -static void cgroup_offline_fn(struct work_struct *work) +static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); - - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ - for_each_root_subsys(cgrp->root, ss) - offline_css(ss, cgrp); - - /* - * Put the css refs from cgroup_destroy_locked(). Each css holds - * an extra reference to the cgroup's dentry and cgroup removal - * proceeds regardless of css refs. On the last put of each css, - * whenever that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ - for_each_root_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); + /* + * We should remove the cgroup object from idr before its grace + * period starts, so we won't be looking up a cgroup while the + * cgroup is being freed. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - - mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) @@ -4646,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } @@ -4665,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, cgroup_dummy_top); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4683,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, cgroup_dummy_top)); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4744,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4756,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4787,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, cgroup_dummy_top); + ret = online_css(css); if (ret) goto err_unload; @@ -4819,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it + * try_module_get() in rebind_subsystems() should ensure that it * doesn't start being used while we're killing it off. */ BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); - offline_css(ss, cgroup_dummy_top); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); @@ -4860,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top); - cgroup_dummy_top->subsys[ss->subsys_id] = NULL; + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } @@ -4943,6 +5112,10 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, + 0, 1, GFP_KERNEL); + BUG_ON(err < 0); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -5099,7 +5272,7 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) @@ -5212,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = cset->subsys[i]->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + ss->exit(css, old_css, tsk); } } } @@ -5474,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); @@ -5525,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* is @dentry a cgroup dir? */ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); +} - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; - return css ? css : ERR_PTR(-ENOENT); +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss); + return NULL; } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5559,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) return css; } -static void debug_css_free(struct cgroup *cgrp) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp->subsys[debug_subsys_id]); + kfree(css); } -static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return cgroup_task_count(cgrp); + return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cgrp, +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, struct cftype *cft) { u64 count; @@ -5585,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, return count; } -static int current_css_set_cg_links_read(struct cgroup *cgrp, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { @@ -5612,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cgrp, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) { + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; @@ -5638,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -45,25 +45,19 @@ struct freezer { spinlock_t lock; }; -static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); + return css ? container_of(css, struct freezer, css) : NULL; } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_subsys_state(task, freezer_subsys_id), - struct freezer, css); + return css_freezer(task_css(task, freezer_subsys_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { - struct cgroup *pcg = freezer->css.cgroup->parent; - - if (pcg) - return cgroup_freezer(pcg); - return NULL; + return css_freezer(css_parent(&freezer->css)); } bool cgroup_freezing(struct task_struct *task) @@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; @@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) } /** - * freezer_css_online - commit creation of a freezer cgroup - * @cgroup: cgroup being created + * freezer_css_online - commit creation of a freezer css + * @css: css being created * - * We're committing to creation of @cgroup. Mark it online and inherit + * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_css_online(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); /* * The following double locking and freezing state inheritance * guarantee that @cgroup can never escape ancestors' freezing - * states. See cgroup_for_each_descendant_pre() for details. + * states. See css_for_each_descendant_pre() for details. */ if (parent) spin_lock_irq(&parent->lock); @@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) } /** - * freezer_css_offline - initiate destruction of @cgroup - * @cgroup: cgroup being destroyed + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed * - * @cgroup is going away. Mark it dead and decrement system_freezing_count - * if it was holding one. + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. */ -static void freezer_css_offline(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); spin_lock_irq(&freezer->lock); @@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_css_free(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_freezer(cgroup)); + kfree(css_freezer(css)); } /* @@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_subsys_state *new_css, + struct cgroup_taskset *tset) { - struct freezer *freezer = cgroup_freezer(new_cgrp); + struct freezer *freezer = css_freezer(new_css); struct task_struct *task; bool clear_frozen = false; spin_lock_irq(&freezer->lock); /* - * Make the new tasks conform to the current state of @new_cgrp. + * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * - * Tasks in @tset are on @new_cgrp but may not conform to its + * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgrp, tset) { + cgroup_taskset_for_each(task, new_css, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { @@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) * The root cgroup is non-freezable, so we can skip the * following check. */ - if (!freezer->css.cgroup->parent) + if (!parent_freezer(freezer)) goto out; spin_lock_irq(&freezer->lock); @@ -244,7 +240,7 @@ out: /** * update_if_frozen - update whether a cgroup finished freezing - * @cgroup: cgroup of interest + * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, @@ -255,14 +251,14 @@ out: * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being - * migrated into or out of @cgroup, so we can't verify task states against + * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup) +static void update_if_frozen(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); - struct cgroup *pos; - struct cgroup_iter it; + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, cgroup) { - struct freezer *child = cgroup_freezer(pos); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) @@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) } /* are all tasks frozen? */ - cgroup_iter_start(cgroup, &it); + css_task_iter_start(css, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + while ((task = css_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, cgroup) + css_for_each_descendant_post(pos, css) update_if_frozen(pos); - update_if_frozen(cgroup); rcu_read_unlock(); - seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } static void freeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) freeze_task(task); - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) __thaw_task(task); - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); } /** @@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, */ static void freezer_change_state(struct freezer *freezer, bool freeze) { - struct cgroup *pos; - - /* update @freezer */ - spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); - spin_unlock_irq(&freezer->lock); + struct cgroup_subsys_state *pos; /* * Update all its descendants in pre-order traversal. Each @@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * CGROUP_FREEZING_PARENT. */ rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { - struct freezer *pos_f = cgroup_freezer(pos); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - /* - * Our update to @parent->state is already visible which is - * all we need. No need to lock @parent. For more info on - * synchronization, see freezer_post_create(). - */ spin_lock_irq(&pos_f->lock); - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); + + if (pos_f == freezer) { + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + } else { + /* + * Our update to @parent->state is already visible + * which is all we need. No need to lock @parent. + * For more info on synchronization, see + * freezer_post_create(). + */ + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + } + spin_unlock_irq(&pos_f->lock); } rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, struct cftype *cft, +static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { bool freeze; @@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), freeze); + freezer_change_state(css_freezer(css), freeze); return 0; } -static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } -static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ea1966db34f2..6bf981e13c43 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -68,10 +68,6 @@ */ int number_of_cpusets __read_mostly; -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; -struct cpuset; - /* See "Frequency meter" comments, below. */ struct fmeter { @@ -115,27 +111,20 @@ struct cpuset { int relax_domain_level; }; -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), - struct cpuset, css); + return css ? container_of(css, struct cpuset, css) : NULL; } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_subsys_state(task, cpuset_subsys_id), - struct cpuset, css); + return css_cs(task_css(task, cpuset_subsys_id)); } -static inline struct cpuset *parent_cs(const struct cpuset *cs) +static inline struct cpuset *parent_cs(struct cpuset *cs) { - struct cgroup *pcgrp = cs->css.cgroup->parent; - - if (pcgrp) - return cgroup_cs(pcgrp); - return NULL; + return css_cs(css_parent(&cs->css)); } #ifdef CONFIG_NUMA @@ -212,29 +201,30 @@ static struct cpuset top_cpuset = { /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ -#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ - cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ - if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_cgrp by calling - * cgroup_rightmost_descendant() to skip subtree. + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ - cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ - if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global mutexes guarding cpuset structures - cpuset_mutex @@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = { * * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) +static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); @@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, * * Call with callback_mutex held. */ -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); @@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; @@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) * Return 0 if valid, -errno if not. */ -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +static int validate_change(struct cpuset *cur, struct cpuset *trial) { - struct cgroup *cgrp; + struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; @@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cgrp, cur) + cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cgrp, par) { + cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) + continue; + /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } @@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; doms = NULL; dattr = NULL; @@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree */ - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. + * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed + * mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, void *data) { - struct cpuset *cpus_cs; + struct cpuset *cs = data; + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); + css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. @@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; - - if (update_root) - update_tasks_cpumask(root_cs, heap); + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); - continue; + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } +struct cpuset_change_nodemask_arg { + struct cpuset *cs; + nodemask_t *newmems; +}; + /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if * memory_migrate flag is set. Called with cpuset_mutex held. */ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) +static void cpuset_change_nodemask(struct task_struct *p, void *data) { - struct cpuset *cs = cgroup_cs(scan->cg); + struct cpuset_change_nodemask_arg *arg = data; + struct cpuset *cs = arg->cs; struct mm_struct *mm; int migrate; - nodemask_t *newmems = scan->data; - cpuset_change_task_nodemask(p, newmems); + cpuset_change_task_nodemask(p, arg->newmems); mm = get_task_mm(p); if (!mm) @@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); mmput(mm); } @@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. + * Called with cpuset_mutex held. No return value. It's guaranteed that + * css_scan_tasks() always returns 0 if @heap != NULL. */ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cgroup_scanner scan; struct cpuset *mems_cs = effective_nodemask_cpuset(cs); + struct cpuset_change_nodemask_arg arg = { .cs = cs, + .newmems = &newmems }; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(mems_cs, &newmems); - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = &newmems; - /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't * take while holding tasklist_lock. Forks can happen - the @@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(&scan); + css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); /* * All the tasks' nodemasks have been updated, update @@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. @@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; - - if (update_root) - update_tasks_nodemask(root_cs, heap); + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); - continue; + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!nodes_empty(cp->mems_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/* +/** * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup. + * Called by css_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_flag(struct task_struct *tsk, void *data) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + struct cpuset *cs = data; + + cpuset_update_task_spread_flag(cs, tsk); } -/* +/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); + css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); } /* @@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct task_struct *task; int ret; @@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * flag is set. */ ret = -ENOSPC; - if (!cgroup_sane_behavior(cgrp) && + if (!cgroup_sane_behavior(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1508,11 +1493,11 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup *cgrp, +static void cpuset_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mutex_lock(&cpuset_mutex); - cgroup_cs(cgrp)->attach_in_progress--; + css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); } @@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, + cpuset_subsys_id); + struct cpuset *cs = css_cs(css); + struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1608,9 +1595,10 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = 0; @@ -1657,9 +1645,10 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1683,10 +1672,10 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cpuset_write_resmask(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *trialcs; int retval = -ENODEV; @@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1801,9 +1789,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1949,11 +1937,12 @@ static struct cftype files[] = { * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; - if (!cgrp->parent) + if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cpuset_css_online(struct cgroup *cgrp) +static int cpuset_css_online(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cg; + struct cgroup_subsys_state *pos_css; if (!parent) return 0; @@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* @@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cg, parent) { + cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; @@ -2027,9 +2016,15 @@ out_unlock: return 0; } -static void cpuset_css_offline(struct cgroup *cgrp) +/* + * If the cpuset being removed has its flag 'sched_load_balance' + * enabled, then simulate turning sched_load_balance off, which + * will call rebuild_sched_domains_locked(). + */ + +static void cpuset_css_offline(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). - */ - -static void cpuset_css_free(struct cgroup *cgrp) +static void cpuset_css_free(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); free_cpumask_var(cs->cpus_allowed); kfree(cs); @@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { - if (!css_tryget(&cs->css)) + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (cs == &top_cpuset || !css_tryget(&cs->css)) continue; rcu_read_unlock(); @@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cpus_cs; + struct cpuset *cpus_cs; rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); @@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) +static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); @@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) */ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - const struct cpuset *cs; /* current cpuset ancestors */ + struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) @@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index f86599e8c123..9300f5226077 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -340,8 +340,8 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); + return container_of(task_css(task, perf_subsys_id), + struct perf_cgroup, css); } static inline bool @@ -591,7 +591,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = cgroup_css_from_dir(f.file, perf_subsys_id); + rcu_read_lock(); + + css = css_from_dir(f.file->f_dentry, &perf_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: + rcu_read_unlock(); fdput(f); return ret; } @@ -7798,7 +7801,8 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; @@ -7815,11 +7819,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_css_free(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { - struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), - struct perf_cgroup, css); + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); + free_percpu(jc->info); kfree(jc); } @@ -7831,15 +7834,17 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +static void perf_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task) { /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 05c39f030314..e53bda3ff2f1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6815,7 +6815,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7137,23 +7137,22 @@ int sched_rt_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css ? container_of(css, struct task_group, css) : NULL; } -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7161,41 +7160,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent; + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css_parent(css)); - if (!cgrp->parent) - return 0; - - parent = cgroup_tg(cgrp->parent); - sched_online_group(tg, parent); + if (parent) + sched_online_group(tg, parent); return 0; } -static void cpu_cgroup_css_free(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static void cpu_cgroup_css_offline(struct cgroup *cgrp) +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_offline_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7206,18 +7202,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, - struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7231,15 +7227,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7361,26 +7358,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7461,10 +7460,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); @@ -7477,26 +7476,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -33,30 +33,20 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); + return css ? container_of(css, struct cpuacct, css) : NULL; } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(task_css(tsk, cpuacct_subsys_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(css_parent(&ca->css)); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { }; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuacct *ca; - if (!cgrp->parent) + if (!parent_css) return &root_cpuacct.css; ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -96,9 +87,9 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup_subsys_state *css) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); free_percpu(ca->cpustat); free_percpu(ca->cpuusage); @@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; @@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int err = 0; int i; @@ -172,10 +163,10 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct cpuacct *ca = cgroup_ca(cgroup); + struct cpuacct *ca = css_ca(css); u64 percpu; int i; @@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct cgroup_map_cb *cb) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int cpu; s64 val = 0; @@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = __parent_ca(ca); + ca = parent_ca(ca); } rcu_read_unlock(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..471a56db05ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We cannot use task_subsys_state() and friends because the cgroup - * subsystem changes that value before the cgroup_subsys::attach() method - * is called, therefore we cannot pin it and might observe the wrong value. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9cea7de22ffb..bda8e44f6fde 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -36,21 +36,13 @@ static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct hugetlb_cgroup, css); -} - -static inline -struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) -{ - return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, - hugetlb_subsys_id)); + return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_subsys_state(task, - hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -58,17 +50,15 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) return (h_cg == root_h_cgroup); } -static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) +static inline struct hugetlb_cgroup * +parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - if (!cg->parent) - return NULL; - return hugetlb_cgroup_from_cgroup(cg->parent); + return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); } -static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) +static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { int idx; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); for (idx = 0; idx < hugetlb_max_hstate; idx++) { if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) @@ -77,19 +67,18 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) return false; } -static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { + struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; int idx; - struct cgroup *parent_cgroup; - struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); - parent_cgroup = cgroup->parent; - if (parent_cgroup) { - parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); + if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) res_counter_init(&h_cgroup->hugepage[idx], &parent_h_cgroup->hugepage[idx]); @@ -101,11 +90,11 @@ static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgrou return &h_cgroup->css; } -static void hugetlb_cgroup_css_free(struct cgroup *cgroup) +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cgroup; - h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); + h_cgroup = hugetlb_cgroup_from_css(css); kfree(h_cgroup); } @@ -117,15 +106,14 @@ static void hugetlb_cgroup_css_free(struct cgroup *cgroup) * page reference and test for page active here. This function * cannot fail. */ -static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, +static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { int csize; struct res_counter *counter; struct res_counter *fail_res; struct hugetlb_cgroup *page_hcg; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); - struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); + struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); page_hcg = hugetlb_cgroup_from_page(page); /* @@ -155,8 +143,9 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) +static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; int idx = 0; @@ -165,13 +154,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) for_each_hstate(h) { spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(idx, cgroup, page); + hugetlb_cgroup_move_parent(idx, h_cg, page); spin_unlock(&hugetlb_lock); idx++; } cond_resched(); - } while (hugetlb_cgroup_have_usage(cgroup)); + } while (hugetlb_cgroup_have_usage(h_cg)); } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, @@ -253,14 +242,15 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { u64 val; char str[64]; int idx, name, len; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -270,12 +260,12 @@ static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, - const char *buffer) +static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -300,10 +290,11 @@ static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, return ret; } -static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) +static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, + unsigned int event) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(event); name = MEMFILE_ATTR(event); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0878ff7c26a9..3b83957b6439 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -483,10 +483,9 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct mem_cgroup, css); + return s ? container_of(s, struct mem_cgroup, css) : NULL; } /* Some nice accessors for the vmpressure. */ @@ -1035,12 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -{ - return mem_cgroup_from_css( - cgroup_subsys_state(cont, mem_cgroup_subsys_id)); -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -1051,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) @@ -1084,20 +1077,11 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, struct mem_cgroup *last_visited) { - struct cgroup *prev_cgroup, *next_cgroup; + struct cgroup_subsys_state *prev_css, *next_css; - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) - return root; - - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; + prev_css = last_visited ? &last_visited->css : NULL; skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); + next_css = css_next_descendant_pre(prev_css, &root->css); /* * Even if we found a group we have to make sure it is @@ -1106,13 +1090,13 @@ skip_node: * last_visited css is safe to use because it is * protected by css_get and the tree walk is rcu safe. */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); + if (next_css) { + struct mem_cgroup *mem = mem_cgroup_from_css(next_css); + if (css_tryget(&mem->css)) return mem; else { - prev_cgroup = next_cgroup; + prev_css = next_css; goto skip_node; } } @@ -1525,10 +1509,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; - /* root ? */ - if (cgrp->parent == NULL) + if (!css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; @@ -1805,12 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { - struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + css_task_iter_start(&iter->css, &it); + while ((task = css_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1823,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1840,7 +1821,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); } if (!chosen) @@ -2954,10 +2935,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -4943,10 +4924,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) */ static inline bool __memcg_has_children(struct mem_cgroup *memcg) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* bounce at first found */ - cgroup_for_each_child(pos, memcg->css.cgroup) + css_for_each_child(pos, &memcg->css) return true; return false; } @@ -5002,9 +4983,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, + unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int ret; if (mem_cgroup_is_root(memcg)) @@ -5017,21 +4999,18 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) } -static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return mem_cgroup_from_cont(cont)->use_hierarchy; + return mem_cgroup_from_css(css)->use_hierarchy; } -static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, - u64 val) +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_memcg = NULL; - - if (parent) - parent_memcg = mem_cgroup_from_cont(parent); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5101,11 +5080,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); char str[64]; u64 val; int name, len; @@ -5138,11 +5117,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -5158,7 +5137,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || memcg_has_children(memcg)) { + if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -5228,10 +5207,10 @@ out: * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; int name; unsigned long long val; @@ -5255,7 +5234,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); else if (type == _KMEM) - ret = memcg_update_kmem_limit(cont, val); + ret = memcg_update_kmem_limit(css, val); else return -EINVAL; break; @@ -5283,18 +5262,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, unsigned long long *mem_limit, unsigned long long *memsw_limit) { - struct cgroup *cgroup; unsigned long long min_limit, min_memsw_limit, tmp; min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - cgroup = memcg->css.cgroup; if (!memcg->use_hierarchy) goto out; - while (cgroup->parent) { - cgroup = cgroup->parent; - memcg = mem_cgroup_from_cont(cgroup); + while (css_parent(&memcg->css)) { + memcg = mem_cgroup_from_css(css_parent(&memcg->css)); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5307,9 +5283,9 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int name; enum res_type type; @@ -5342,17 +5318,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; + return mem_cgroup_from_css(css)->move_charge_at_immigrate; } #ifdef CONFIG_MMU -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -5367,7 +5343,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, return 0; } #else -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { return -ENOSYS; @@ -5375,13 +5351,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int memcg_numa_stat_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { int nid; unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); @@ -5426,10 +5402,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *mi; unsigned int i; @@ -5513,27 +5489,23 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, return 0; } -static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); return mem_cgroup_swappiness(memcg); } -static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, - u64 val) +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (val > 100) + if (val > 100 || !parent) return -EINVAL; - if (cgrp->parent == NULL) - return -EINVAL; - - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ @@ -5636,10 +5608,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup *cgrp, +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5719,10 +5691,10 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5798,10 +5770,10 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup *cgrp, +static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5823,10 +5795,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5844,10 +5816,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup *cgrp, +static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); @@ -5858,18 +5830,16 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, return 0; } -static int mem_cgroup_oom_control_write(struct cgroup *cgrp, +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!cgrp->parent || !((val == 0) || (val == 1))) + if (!parent || !((val == 0) || (val == 1))) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || memcg_has_children(memcg)) { @@ -6228,7 +6198,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void) } static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *memcg; long error = -ENOMEM; @@ -6243,7 +6213,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) goto free_out; /* root ? */ - if (cont->parent == NULL) { + if (parent_css == NULL) { root_mem_cgroup = memcg; res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6265,17 +6235,16 @@ free_out: } static int -mem_cgroup_css_online(struct cgroup *cont) +mem_cgroup_css_online(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; - if (!cont->parent) + if (!parent) return 0; mutex_lock(&memcg_create_mutex); - memcg = mem_cgroup_from_cont(cont); - parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; memcg->oom_kill_disable = parent->oom_kill_disable; @@ -6326,9 +6295,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) mem_cgroup_iter_invalidate(root_mem_cgroup); } -static void mem_cgroup_css_offline(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); kmem_cgroup_css_offline(memcg); @@ -6338,9 +6307,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont) vmpressure_cleanup(&memcg->vmpressure); } -static void mem_cgroup_css_free(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -6710,12 +6679,12 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long move_charge_at_immigrate; /* @@ -6757,7 +6726,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); @@ -6905,7 +6874,7 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); @@ -6920,16 +6889,16 @@ static void mem_cgroup_move_task(struct cgroup *cont, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } @@ -6939,15 +6908,15 @@ static void mem_cgroup_move_task(struct cgroup *cont, * Cgroup retains root cgroups across [un]mount cycles making it necessary * to verify sane_behavior flag on each mount attempt. */ -static void mem_cgroup_bind(struct cgroup *root) +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* * use_hierarchy is forced with sane_behavior. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root)) - mem_cgroup_from_cont(root)->use_hierarchy = true; + if (cgroup_sane_behavior(root_css->cgroup)) + mem_cgroup_from_css(root_css)->use_hierarchy = true; } struct cgroup_subsys mem_cgroup_subsys = { diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 0c1e37d829fa..e0f62837c3f4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,15 +74,10 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) return container_of(work, struct vmpressure, work); } -static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) -{ - return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); -} - static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg = parent_mem_cgroup(memcg); if (!memcg) @@ -283,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @cg: cgroup that is interested in vmpressure notifications + * @css: css that is interested in vmpressure notifications * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) @@ -298,10 +293,11 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * cftype).register_event, and then cgroup core will handle everything by * itself. */ -int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; int level; @@ -329,7 +325,7 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @cg: cgroup handle + * @css: css handle * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * @@ -341,10 +337,11 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, * cftype).unregister_event, and then cgroup core will handle everything * by itself. */ -void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e533259dce3c..d9cd627e6a16 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -29,12 +29,6 @@ #define PRIOMAP_MIN_SZ 128 -static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), - struct cgroup_netprio_state, css); -} - /* * Extend @dev->priomap so that it's large enough to accomodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful @@ -87,67 +81,70 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /** * netprio_prio - return the effective netprio of a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ -static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->cgroup->id; - if (map && cgrp->id < map->priomap_len) - return map->priomap[cgrp->id]; + if (map && id < map->priomap_len) + return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * - * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ -static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, - u32 prio) +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) { struct netprio_map *map; + int id = css->cgroup->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!prio && (!map || map->priomap_len <= cgrp->id)) + if (!prio && (!map || map->priomap_len <= id)) return 0; - ret = extend_netdev_table(dev, cgrp->id); + ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); - map->priomap[cgrp->id] = prio; + map->priomap[id] = prio; return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cgroup_netprio_state *cs; + struct cgroup_subsys_state *css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) return ERR_PTR(-ENOMEM); - return &cs->css; + return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *parent_css = css_parent(css); struct net_device *dev; int ret = 0; - if (!parent) + if (!parent_css) return 0; rtnl_lock(); @@ -156,9 +153,9 @@ static int cgrp_css_online(struct cgroup *cgrp) * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { - u32 prio = netprio_prio(parent, dev); + u32 prio = netprio_prio(parent_css, dev); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); if (ret) break; } @@ -166,29 +163,29 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_netprio_state(cgrp)); + kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->cgroup->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, +static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(cont, dev)); + cb->fill(cb, dev->name, netprio_prio(css, dev)); rcu_read_unlock(); return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, +static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { char devname[IFNAMSIZ + 1]; @@ -205,7 +202,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); rtnl_unlock(); dev_put(dev); @@ -221,12 +218,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index da14436c1735..8a57d79b0b16 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; int ret = 0; @@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg) return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); } -static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); u64 val; switch (cft->private) { @@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) +static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg; struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_css(css); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3a294eb98d61..867b4a3e3980 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,19 +23,18 @@ #include #include -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_subsys_state(p, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(task_css(p, net_cls_subsys_id)); } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; @@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - if (cgrp->parent) - cgrp_cls_state(cgrp)->classid = - cgrp_cls_state(cgrp->parent)->classid; + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + + if (parent) + cs->classid = parent->classid; return 0; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_cls_state(cgrp)); + kfree(css_cls_state(css)); } static int update_classid(const void *v, struct file *file, unsigned n) @@ -67,12 +68,13 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); @@ -80,14 +82,15 @@ static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } } -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp_cls_state(cgrp)->classid; + return css_cls_state(css)->classid; } -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) { - cgrp_cls_state(cgrp)->classid = (u32) value; + css_cls_state(css)->classid = (u32) value; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index e8aad69f0d69..c123628d3f84 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -53,22 +53,17 @@ struct dev_cgroup { static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { - return container_of(s, struct dev_cgroup, css); -} - -static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) -{ - return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); + return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { - return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); + return css_to_devcgroup(task_css(task, devices_subsys_id)); } struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup *new_cgrp, +static int devcgroup_can_attach(struct cgroup_subsys_state *new_css, struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -193,18 +188,16 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's - * @cgroup: cgroup getting online + * @css: css getting online * returns 0 in case of success, error code otherwise */ -static int devcgroup_online(struct cgroup *cgroup) +static int devcgroup_online(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); int ret = 0; mutex_lock(&devcgroup_mutex); - dev_cgroup = cgroup_to_devcgroup(cgroup); - if (cgroup->parent) - parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; @@ -219,9 +212,9 @@ static int devcgroup_online(struct cgroup *cgroup) return ret; } -static void devcgroup_offline(struct cgroup *cgroup) +static void devcgroup_offline(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; @@ -231,7 +224,8 @@ static void devcgroup_offline(struct cgroup *cgroup) /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; @@ -244,11 +238,10 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) return &dev_cgroup->css; } -static void devcgroup_css_free(struct cgroup *cgroup) +static void devcgroup_css_free(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup; + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); - dev_cgroup = cgroup_to_devcgroup(cgroup); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } @@ -291,10 +284,10 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } -static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int devcgroup_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *devcgroup = css_to_devcgroup(css); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; @@ -394,12 +387,10 @@ static bool may_access(struct dev_cgroup *dev_cgroup, static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct cgroup *pcg = childcg->css.cgroup->parent; - struct dev_cgroup *parent; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); - if (!pcg) + if (!parent) return 1; - parent = cgroup_to_devcgroup(pcg); return may_access(parent, ex, childcg->behavior); } @@ -451,13 +442,13 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg) static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { - struct cgroup *root = devcg_root->css.cgroup, *pos; + struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, root) { - struct dev_cgroup *devcg = cgroup_to_devcgroup(pos); + css_for_each_descendant_pre(pos, &devcg_root->css) { + struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become @@ -465,7 +456,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root, * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ - if (!is_devcg_online(devcg)) + if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); @@ -524,15 +515,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; - struct cgroup *p = devcgroup->css.cgroup; - struct dev_cgroup *parent = NULL; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (p->parent) - parent = cgroup_to_devcgroup(p->parent); - memset(&ex, 0, sizeof(ex)); b = buffer; @@ -677,13 +664,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, return rc; } -static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int devcgroup_access_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int retval; mutex_lock(&devcgroup_mutex); - retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), + retval = devcgroup_update_access(css_to_devcgroup(css), cft->private, buffer); mutex_unlock(&devcgroup_mutex); return retval;