writeback: don't embed root bdi_writeback_congested in bdi_writeback

52ebea749a ("writeback: make backing_dev_info host cgroup-specific
bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's
(bdi_writeback's).  As the congested state needs to be per-wb and
referenced from blkcg side and multiple wbs, the patch made all
non-root cong's (bdi_writeback_congested's) reference counted and
indexed on bdi.

When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all
non-root cong's; however, this can hang indefinitely because cong's can
also be referenced from blkcg_gq's which are destroyed after bdi
destruction is complete.

To fix the bug, bdi destruction will be updated to not wait for cong's
to drain, which naturally means that cong's may outlive the associated
bdi.  This is fine for non-root cong's but is problematic for the root
cong's which are embedded in their bdi's as they may end up getting
dereferenced after the containing bdi's are freed.

This patch makes root cong's behave the same as non-root cong's.  They
are no longer embedded in their bdi's but allocated separately during
bdi initialization, indexed and reference counted the same way.

* As cong handling is the same for all wb's, wb->congested
  initialization is moved into wb_init().

* When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting.
  bdi->wb_congested is now a pointer pointing to the root cong
  allocated during bdi init and minimal refcnting operations are
  implemented.

* The above makes root wb init paths diverge depending on
  CONFIG_CGROUP_WRITEBACK.  Root wb init is moved to cgwb_bdi_init().

This patch in itself shouldn't cause any consequential behavior
differences but prepares for the actual fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jon Christopherson <jon@jons.org>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681
Tested-by: Jon Christopherson <jon@jons.org>

Added <linux/slab.h> include to backing-dev.h for kfree() definition.

Signed-off-by: Jens Axboe <axboe@fb.com>
This commit is contained in:
Tejun Heo 2015-07-02 08:44:34 -06:00 коммит произвёл Jens Axboe
Родитель 4da3064d17
Коммит a13f35e871
3 изменённых файлов: 54 добавлений и 44 удалений

Просмотреть файл

@ -50,10 +50,10 @@ enum wb_stat_item {
*/ */
struct bdi_writeback_congested { struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */ unsigned long state; /* WB_[a]sync_congested flags */
atomic_t refcnt; /* nr of attached wb's and blkg */
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct backing_dev_info *bdi; /* the associated bdi */ struct backing_dev_info *bdi; /* the associated bdi */
atomic_t refcnt; /* nr of attached wb's and blkg */
int blkcg_id; /* ID of the associated blkcg */ int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif #endif
@ -150,11 +150,12 @@ struct backing_dev_info {
atomic_long_t tot_write_bandwidth; atomic_long_t tot_write_bandwidth;
struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback wb; /* the root writeback info for this bdi */
struct bdi_writeback_congested wb_congested; /* its congested state */
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */ struct rb_root cgwb_congested_tree; /* their congested states */
atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#else
struct bdi_writeback_congested *wb_congested;
#endif #endif
wait_queue_head_t wb_waitq; wait_queue_head_t wb_waitq;

Просмотреть файл

@ -15,6 +15,7 @@
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h> #include <linux/backing-dev-defs.h>
#include <linux/slab.h>
int __must_check bdi_init(struct backing_dev_info *bdi); int __must_check bdi_init(struct backing_dev_info *bdi);
void bdi_destroy(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi);
@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
static inline struct bdi_writeback_congested * static inline struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{ {
return bdi->wb.congested; atomic_inc(&bdi->wb_congested->refcnt);
return bdi->wb_congested;
} }
static inline void wb_congested_put(struct bdi_writeback_congested *congested) static inline void wb_congested_put(struct bdi_writeback_congested *congested)
{ {
if (atomic_dec_and_test(&congested->refcnt))
kfree(congested);
} }
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)

Просмотреть файл

@ -287,7 +287,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
#define INIT_BW (100 << (20 - PAGE_SHIFT)) #define INIT_BW (100 << (20 - PAGE_SHIFT))
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp) int blkcg_id, gfp_t gfp)
{ {
int i, err; int i, err;
@ -311,21 +311,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list); INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
if (!wb->congested)
return -ENOMEM;
err = fprop_local_init_percpu(&wb->completions, gfp); err = fprop_local_init_percpu(&wb->completions, gfp);
if (err) if (err)
return err; goto out_put_cong;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) { for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp); err = percpu_counter_init(&wb->stat[i], 0, gfp);
if (err) { if (err)
while (--i) goto out_destroy_stat;
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
return err;
}
} }
return 0; return 0;
out_destroy_stat:
while (--i)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
return err;
} }
/* /*
@ -361,6 +369,7 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]); percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions); fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
} }
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
@ -392,9 +401,6 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
struct bdi_writeback_congested *new_congested = NULL, *congested; struct bdi_writeback_congested *new_congested = NULL, *congested;
struct rb_node **node, *parent; struct rb_node **node, *parent;
unsigned long flags; unsigned long flags;
if (blkcg_id == 1)
return &bdi->wb_congested;
retry: retry:
spin_lock_irqsave(&cgwb_lock, flags); spin_lock_irqsave(&cgwb_lock, flags);
@ -453,9 +459,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
struct backing_dev_info *bdi = congested->bdi; struct backing_dev_info *bdi = congested->bdi;
unsigned long flags; unsigned long flags;
if (congested->blkcg_id == 1)
return;
local_irq_save(flags); local_irq_save(flags);
if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
local_irq_restore(flags); local_irq_restore(flags);
@ -480,7 +483,6 @@ static void cgwb_release_workfn(struct work_struct *work)
css_put(wb->memcg_css); css_put(wb->memcg_css);
css_put(wb->blkcg_css); css_put(wb->blkcg_css);
wb_congested_put(wb->congested);
fprop_local_destroy_percpu(&wb->memcg_completions); fprop_local_destroy_percpu(&wb->memcg_completions);
percpu_ref_exit(&wb->refcnt); percpu_ref_exit(&wb->refcnt);
@ -541,7 +543,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
if (!wb) if (!wb)
return -ENOMEM; return -ENOMEM;
ret = wb_init(wb, bdi, gfp); ret = wb_init(wb, bdi, blkcg_css->id, gfp);
if (ret) if (ret)
goto err_free; goto err_free;
@ -553,12 +555,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
if (ret) if (ret)
goto err_ref_exit; goto err_ref_exit;
wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
if (!wb->congested) {
ret = -ENOMEM;
goto err_fprop_exit;
}
wb->memcg_css = memcg_css; wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css; wb->blkcg_css = blkcg_css;
INIT_WORK(&wb->release_work, cgwb_release_workfn); INIT_WORK(&wb->release_work, cgwb_release_workfn);
@ -588,12 +584,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
if (ret) { if (ret) {
if (ret == -EEXIST) if (ret == -EEXIST)
ret = 0; ret = 0;
goto err_put_congested; goto err_fprop_exit;
} }
goto out_put; goto out_put;
err_put_congested:
wb_congested_put(wb->congested);
err_fprop_exit: err_fprop_exit:
fprop_local_destroy_percpu(&wb->memcg_completions); fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit: err_ref_exit:
@ -662,14 +656,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
return wb; return wb;
} }
static void cgwb_bdi_init(struct backing_dev_info *bdi) static int cgwb_bdi_init(struct backing_dev_info *bdi)
{ {
bdi->wb.memcg_css = mem_cgroup_root_css; int ret;
bdi->wb.blkcg_css = blkcg_root_css;
bdi->wb_congested.blkcg_id = 1;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT; bdi->cgwb_congested_tree = RB_ROOT;
atomic_set(&bdi->usage_cnt, 1); atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = mem_cgroup_root_css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
} }
static void cgwb_bdi_destroy(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
@ -732,15 +732,28 @@ void wb_blkcg_offline(struct blkcg *blkcg)
#else /* CONFIG_CGROUP_WRITEBACK */ #else /* CONFIG_CGROUP_WRITEBACK */
static void cgwb_bdi_init(struct backing_dev_info *bdi) { } static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
int err;
bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
if (!bdi->wb_congested)
return -ENOMEM;
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
kfree(bdi->wb_congested);
return err;
}
return 0;
}
static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
#endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CONFIG_CGROUP_WRITEBACK */
int bdi_init(struct backing_dev_info *bdi) int bdi_init(struct backing_dev_info *bdi)
{ {
int err;
bdi->dev = NULL; bdi->dev = NULL;
bdi->min_ratio = 0; bdi->min_ratio = 0;
@ -749,15 +762,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->bdi_list);
init_waitqueue_head(&bdi->wb_waitq); init_waitqueue_head(&bdi->wb_waitq);
err = wb_init(&bdi->wb, bdi, GFP_KERNEL); return cgwb_bdi_init(bdi);
if (err)
return err;
bdi->wb_congested.state = 0;
bdi->wb.congested = &bdi->wb_congested;
cgwb_bdi_init(bdi);
return 0;
} }
EXPORT_SYMBOL(bdi_init); EXPORT_SYMBOL(bdi_init);