From a13f35e8714009145e32ebe2bf25b84e1376e314 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jul 2015 08:44:34 -0600 Subject: [PATCH 1/3] writeback: don't embed root bdi_writeback_congested in bdi_writeback 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's (bdi_writeback's). As the congested state needs to be per-wb and referenced from blkcg side and multiple wbs, the patch made all non-root cong's (bdi_writeback_congested's) reference counted and indexed on bdi. When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all non-root cong's; however, this can hang indefinitely because wb's can also be referenced from blkcg_gq's which are destroyed after bdi destruction is complete. To fix the bug, bdi destruction will be updated to not wait for cong's to drain, which naturally means that cong's may outlive the associated bdi. This is fine for non-root cong's but is problematic for the root cong's which are embedded in their bdi's as they may end up getting dereferenced after the containing bdi's are freed. This patch makes root cong's behave the same as non-root cong's. They are no longer embedded in their bdi's but allocated separately during bdi initialization, indexed and reference counted the same way. * As cong handling is the same for all wb's, wb->congested initialization is moved into wb_init(). * When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting. bdi->wb_congested is now a pointer pointing to the root cong allocated during bdi init and minimal refcnting operations are implemented. * The above makes root wb init paths diverge depending on CONFIG_CGROUP_WRITEBACK. root wb init is moved to cgwb_bdi_init(). This patch in itself shouldn't cause any consequential behavior differences but prepares for the actual fix. Signed-off-by: Tejun Heo Reported-by: Jon Christopherson Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681 Tested-by: Jon Christopherson Added include to backing-dev.h for kfree() definition. Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 5 +- include/linux/backing-dev.h | 6 ++- mm/backing-dev.c | 87 +++++++++++++++++--------------- 3 files changed, 54 insertions(+), 44 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a48d90e3bcbb..a23209b43842 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -50,10 +50,10 @@ enum wb_stat_item { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *bdi; /* the associated bdi */ - atomic_t refcnt; /* nr of attached wb's and blkg */ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif @@ -150,11 +150,12 @@ struct backing_dev_info { atomic_long_t tot_write_bandwidth; struct bdi_writeback wb; /* the root writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; /* its congested state */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#else + struct bdi_writeback_congested *wb_congested; #endif wait_queue_head_t wb_waitq; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0e6d4828a77a..0fe9df983ab7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -15,6 +15,7 @@ #include #include #include +#include int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - return bdi->wb.congested; + atomic_inc(&bdi->wb_congested->refcnt); + return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { + if (atomic_dec_and_test(&congested->refcnt)) + kfree(congested); } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7756da31b02b..51cc461e7256 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -287,7 +287,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - gfp_t gfp) + int blkcg_id, gfp_t gfp) { int i, err; @@ -311,21 +311,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); + if (!wb->congested) + return -ENOMEM; + err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - return err; + goto out_put_cong; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); - if (err) { - while (--i) - percpu_counter_destroy(&wb->stat[i]); - fprop_local_destroy_percpu(&wb->completions); - return err; - } + if (err) + goto out_destroy_stat; } return 0; + +out_destroy_stat: + while (--i) + percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); +out_put_cong: + wb_congested_put(wb->congested); + return err; } /* @@ -361,6 +369,7 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); + wb_congested_put(wb->congested); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -392,9 +401,6 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) struct bdi_writeback_congested *new_congested = NULL, *congested; struct rb_node **node, *parent; unsigned long flags; - - if (blkcg_id == 1) - return &bdi->wb_congested; retry: spin_lock_irqsave(&cgwb_lock, flags); @@ -453,9 +459,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested) struct backing_dev_info *bdi = congested->bdi; unsigned long flags; - if (congested->blkcg_id == 1) - return; - local_irq_save(flags); if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { local_irq_restore(flags); @@ -480,7 +483,6 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->memcg_css); css_put(wb->blkcg_css); - wb_congested_put(wb->congested); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); @@ -541,7 +543,7 @@ static int cgwb_create(struct backing_dev_info *bdi, if (!wb) return -ENOMEM; - ret = wb_init(wb, bdi, gfp); + ret = wb_init(wb, bdi, blkcg_css->id, gfp); if (ret) goto err_free; @@ -553,12 +555,6 @@ static int cgwb_create(struct backing_dev_info *bdi, if (ret) goto err_ref_exit; - wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); - if (!wb->congested) { - ret = -ENOMEM; - goto err_fprop_exit; - } - wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_WORK(&wb->release_work, cgwb_release_workfn); @@ -588,12 +584,10 @@ static int cgwb_create(struct backing_dev_info *bdi, if (ret) { if (ret == -EEXIST) ret = 0; - goto err_put_congested; + goto err_fprop_exit; } goto out_put; -err_put_congested: - wb_congested_put(wb->congested); err_fprop_exit: fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: @@ -662,14 +656,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return wb; } -static void cgwb_bdi_init(struct backing_dev_info *bdi) +static int cgwb_bdi_init(struct backing_dev_info *bdi) { - bdi->wb.memcg_css = mem_cgroup_root_css; - bdi->wb.blkcg_css = blkcg_root_css; - bdi->wb_congested.blkcg_id = 1; + int ret; + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; atomic_set(&bdi->usage_cnt, 1); + + ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (!ret) { + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + } + return ret; } static void cgwb_bdi_destroy(struct backing_dev_info *bdi) @@ -732,15 +732,28 @@ void wb_blkcg_offline(struct blkcg *blkcg) #else /* CONFIG_CGROUP_WRITEBACK */ -static void cgwb_bdi_init(struct backing_dev_info *bdi) { } +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + int err; + + bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); + if (!bdi->wb_congested) + return -ENOMEM; + + err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (err) { + kfree(bdi->wb_congested); + return err; + } + return 0; +} + static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { - int err; - bdi->dev = NULL; bdi->min_ratio = 0; @@ -749,15 +762,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->bdi_list); init_waitqueue_head(&bdi->wb_waitq); - err = wb_init(&bdi->wb, bdi, GFP_KERNEL); - if (err) - return err; - - bdi->wb_congested.state = 0; - bdi->wb.congested = &bdi->wb_congested; - - cgwb_bdi_init(bdi); - return 0; + return cgwb_bdi_init(bdi); } EXPORT_SYMBOL(bdi_init); From a20135ffbc44545596f9b99c970de097fb497bdd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Jul 2015 20:53:37 -0400 Subject: [PATCH 2/3] writeback: don't drain bdi_writeback_congested on bdi destruction 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's (bdi_writeback's). As the congested state needs to be per-wb and referenced from blkcg side and multiple wbs, the patch made all non-root cong's (bdi_writeback_congested's) reference counted and indexed on bdi. When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all non-root cong's; however, this can hang indefinitely because wb's can also be referenced from blkcg_gq's which are destroyed after bdi destruction is complete. This patch fixes the bug by updating bdi destruction to not wait for cong's to drain. A cong is unlinked from bdi->cgwb_congested_tree on bdi destuction regardless of its reference count as the bdi may go away any point after destruction. wb_congested_put() checks whether the cong is already unlinked on release. Signed-off-by: Tejun Heo Reported-by: Jon Christopherson Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681 Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Tested-by: Jon Christopherson Signed-off-by: Jens Axboe --- mm/backing-dev.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 51cc461e7256..dac5bf59309d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -425,7 +425,6 @@ retry: new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - atomic_inc(&bdi->usage_cnt); goto found; } @@ -456,7 +455,6 @@ found: */ void wb_congested_put(struct bdi_writeback_congested *congested) { - struct backing_dev_info *bdi = congested->bdi; unsigned long flags; local_irq_save(flags); @@ -465,12 +463,15 @@ void wb_congested_put(struct bdi_writeback_congested *congested) return; } - rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree); + /* bdi might already have been destroyed leaving @congested unlinked */ + if (congested->bdi) { + rb_erase(&congested->rb_node, + &congested->bdi->cgwb_congested_tree); + congested->bdi = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); kfree(congested); - - if (atomic_dec_and_test(&bdi->usage_cnt)) - wake_up_all(&cgwb_release_wait); } static void cgwb_release_workfn(struct work_struct *work) @@ -675,13 +676,22 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; + struct bdi_writeback_congested *congested, *congested_n; void **slot; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); + + rbtree_postorder_for_each_entry_safe(congested, congested_n, + &bdi->cgwb_congested_tree, rb_node) { + rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + spin_unlock_irq(&cgwb_lock); /* From 758dd7fdffd60507624edce34fff122a63163b3f Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 30 Jun 2015 11:22:52 -0600 Subject: [PATCH 3/3] NVMe: Fix irq freeing when queue_request_irq fails Fixes an issue when queue_reuest_irq fails in nvme_setup_io_queues. This patch initializes all vectors to -1 and resets the vector to -1 in the case of a failure in queue_request_irq. This avoids the free_irq in nvme_suspend_queue if the queue did not get an irq. Signed-off-by: Jon Derrick Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 34338d7438f5..d1d6141920d3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1474,6 +1474,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; + nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; /* make sure queue descriptor is set before queue count, for kthread */ @@ -1726,8 +1727,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) nvmeq->cq_vector = 0; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result) + if (result) { + nvmeq->cq_vector = -1; goto free_nvmeq; + } return result; @@ -2213,8 +2216,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) + if (result) { + adminq->cq_vector = -1; goto free_queues; + } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1);