diff --git a/block/blk-map.c b/block/blk-map.c index f565e11f465a..a54f0543b956 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if (bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index 888a7fec81f7..261353166dcf 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -304,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -316,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3cd921e6121e..03c46412fff4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref) ns->disk->private_data = NULL; spin_unlock(&dev_list_lock); - nvme_put_ctrl(ns->ctrl); put_disk(ns->disk); + ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -183,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, goto out_unmap; } - if (meta_buffer) { + if (meta_buffer && meta_len) { struct bio_integrity_payload *bip; meta = kmalloc(meta_len, GFP_KERNEL); @@ -373,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; + if (io.flags) + return -EINVAL; switch (io.opcode) { case nvme_cmd_write: @@ -424,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; + if (cmd.flags) + return -EINVAL; memset(&c, 0, sizeof(c)); c.common.opcode = cmd.opcode; @@ -556,6 +561,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) u16 old_ms; unsigned short bs; + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", __func__, ns->ctrl->instance, ns->ns_id); @@ -831,6 +840,23 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) return ret; } +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(q, ctrl->page_size - 1); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -888,6 +914,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } } + nvme_set_queue_limits(ctrl, ctrl->admin_q); + kfree(id); return 0; } @@ -1118,9 +1146,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (!ns) return; + ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); + if (ns->instance < 0) + goto out_free_ns; + ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_free_ns; + goto out_release_instance; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; @@ -1134,17 +1166,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (ctrl->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); - } - if (ctrl->stripe_size) - blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + nvme_set_queue_limits(ctrl, ns->queue); disk->major = nvme_major; disk->first_minor = 0; @@ -1153,7 +1177,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->queue = ns->queue; disk->driverfs_dev = ctrl->device; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; @@ -1173,40 +1197,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(disk); out_free_queue: blk_cleanup_queue(ns->queue); + out_release_instance: + ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; - lockdep_assert_held(&ns->ctrl->namespaces_mutex); - - if (kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. - */ - blk_mq_abort_requeue_list(ns->queue); - } if (ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group); del_gendisk(ns->disk); - } - if (kill || !blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_mutex); nvme_put_ns(ns); } @@ -1300,10 +1313,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; - mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); - mutex_unlock(&ctrl->namespaces_mutex); } static DEFINE_IDA(nvme_instance_ida); @@ -1350,6 +1361,7 @@ static void nvme_free_ctrl(struct kref *kref) put_device(ctrl->device); nvme_release_instance(ctrl); + ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } @@ -1390,6 +1402,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, } get_device(ctrl->device); dev_set_drvdata(ctrl->device, ctrl); + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); list_add_tail(&ctrl->node, &nvme_ctrl_list); @@ -1402,6 +1415,38 @@ out: return ret; } +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (!kref_get_unless_zero(&ns->kref)) + continue; + + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. + */ + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + revalidate_disk(ns->disk); + + blk_set_queue_dying(ns->queue); + blk_mq_abort_requeue_list(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + + nvme_put_ns(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9664d07d807d..fb15ba5f5d19 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -72,6 +72,7 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device *device; /* char device */ struct list_head node; + struct ida ns_ida; char name[12]; char serial[20]; @@ -102,6 +103,7 @@ struct nvme_ns { struct request_queue *queue; struct gendisk *disk; struct kref kref; + int instance; u8 eui[8]; u8 uuid[16]; @@ -112,6 +114,11 @@ struct nvme_ns { bool ext; u8 pi_type; int type; + unsigned long flags; + +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 + u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -240,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a128672472ec..680f5780750c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,7 +86,6 @@ struct nvme_queue; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* @@ -120,6 +119,7 @@ struct nvme_dev { unsigned long flags; #define NVME_CTRL_RESETTING 0 +#define NVME_CTRL_REMOVING 1 struct nvme_ctrl ctrl; struct completion ioq_wait; @@ -286,6 +286,17 @@ static int nvme_init_request(void *data, struct request *req, return 0; } +static void nvme_queue_scan(struct nvme_dev *dev) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + return; + queue_work(nvme_workq, &dev->scan_work); +} + static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -300,7 +311,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(dev->dev, "rescanning\n"); - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); default: dev_warn(dev->dev, "async event result %08x\n", result); } @@ -679,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_BUSY; + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) + ret = BLK_MQ_RQ_QUEUE_BUSY; + else + ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); goto out; } @@ -1250,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. + */ + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1690,14 +1710,14 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; dev->ctrl.tagset = &dev->tagset; } - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); return 0; } -static int nvme_dev_map(struct nvme_dev *dev) +static int nvme_pci_enable(struct nvme_dev *dev) { u64 cap; - int bars, result = -ENOMEM; + int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) @@ -1705,24 +1725,14 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; - goto unmap; + goto disable; } /* @@ -1732,7 +1742,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!pdev->irq) { result = pci_enable_msix(pdev, dev->entry, 1); if (result < 0) - goto unmap; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); @@ -1759,17 +1769,19 @@ static int nvme_dev_map(struct nvme_dev *dev) pci_save_state(pdev); return 0; - unmap: - iounmap(dev->bar); - dev->bar = NULL; disable: - pci_release_regions(pdev); - disable_pci: pci_disable_device(pdev); return result; } static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -1778,12 +1790,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) else if (pdev->msix_enabled) pci_disable_msix(pdev); - if (dev->bar) { - iounmap(dev->bar); - dev->bar = NULL; - pci_release_regions(pdev); - } - if (pci_is_enabled(pdev)) { pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); @@ -1842,7 +1848,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_dev_list_remove(dev); mutex_lock(&dev->shutdown_lock); - if (dev->bar) { + if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } @@ -1855,7 +1861,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } - nvme_dev_unmap(dev); + nvme_pci_disable(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); @@ -1899,10 +1905,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); + + kref_get(&dev->ctrl.kref); + nvme_dev_disable(dev, false); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); - int result; + int result = -ENODEV; if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) goto out; @@ -1911,37 +1927,37 @@ static void nvme_reset_work(struct work_struct *work) * If we're called to reset a live controller first shut it down before * moving on. */ - if (dev->bar) + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_dev_map(dev); + result = nvme_pci_enable(dev); if (result) goto out; result = nvme_configure_admin_queue(dev); if (result) - goto unmap; + goto out; nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) - goto disable; + goto out; result = nvme_init_identify(&dev->ctrl); if (result) - goto free_tags; + goto out; result = nvme_setup_io_queues(dev); if (result) - goto free_tags; + goto out; dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; result = nvme_dev_list_add(dev); if (result) - goto remove; + goto out; /* * Keep the controller around but remove all namespaces if we don't have @@ -1958,19 +1974,8 @@ static void nvme_reset_work(struct work_struct *work) clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; - remove: - nvme_dev_list_remove(dev); - free_tags: - nvme_dev_remove_admin(dev); - blk_put_queue(dev->ctrl.admin_q); - dev->ctrl.admin_q = NULL; - dev->queues[0]->tags = NULL; - disable: - nvme_disable_admin_queue(dev, false); - unmap: - nvme_dev_unmap(dev); out: - nvme_remove_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev, result); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) @@ -1978,19 +1983,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); + nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); nvme_put_ctrl(&dev->ctrl); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - dev_warn(dev->dev, "Removing after probe failure\n"); - kref_get(&dev->ctrl.kref); - if (!schedule_work(&dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static int nvme_reset(struct nvme_dev *dev) { if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) @@ -2042,6 +2040,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .free_ctrl = nvme_pci_free_ctrl, }; +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + return -ENODEV; + if (pci_request_selected_regions(pdev, bars, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; + release: + pci_release_regions(pdev); + return -ENODEV; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2066,6 +2085,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); + result = nvme_dev_map(dev); + if (result) + goto free; + INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); @@ -2089,6 +2112,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); put_pci: put_device(dev->dev); + nvme_dev_unmap(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2112,10 +2136,16 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_dev_disable(dev, true); } +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. + */ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + set_bit(NVME_CTRL_REMOVING, &dev->flags); pci_set_drvdata(pdev, NULL); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); @@ -2126,6 +2156,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f76d8950a57..5c46ed9f3e14 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ +static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); +static struct workqueue_struct *isw_wq; + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -424,8 +426,9 @@ skip_switch: wb_put(new_wb); iput(inode); - deactivate_super(sb); kfree(isw); + + atomic_dec(&isw_nr_in_flight); } static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) @@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) /* needs to grab bh-unsafe locks, bounce to work item */ INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - schedule_work(&isw->work); + queue_work(isw_wq, &isw->work); } /** @@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - - if (inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) - goto out_unlock; - - if (!atomic_inc_not_zero(&inode->i_sb->s_active)) - goto out_unlock; - + if (!(inode->i_sb->s_flags & MS_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } inode->i_state |= I_WB_SWITCH; spin_unlock(&inode->i_lock); ihold(inode); isw->inode = inode; + atomic_inc(&isw_nr_in_flight); + /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the mapping's @@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); return; -out_unlock: - spin_unlock(&inode->i_lock); out_free: if (isw->new_wb) wb_put(isw->new_wb); @@ -847,6 +848,33 @@ restart: wb_put(last_wb); } +/** + * cgroup_writeback_umount - flush inode wb switches for umount + * + * This function is called when a super_block is about to be destroyed and + * flushes in-flight inode wb switches. An inode wb switch goes through + * RCU and then workqueue, so the two need to be flushed in order to ensure + * that all previously scheduled switches are finished. As wb switches are + * rare occurrences and synchronize_rcu() can take a while, perform + * flushing iff wb switches are in flight. + */ +void cgroup_writeback_umount(void) +{ + if (atomic_read(&isw_nr_in_flight)) { + synchronize_rcu(); + flush_workqueue(isw_wq); + } +} + +static int __init cgroup_writeback_init(void) +{ + isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + if (!isw_wq) + return -ENOMEM; + return 0; +} +fs_initcall(cgroup_writeback_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static struct bdi_writeback * diff --git a/fs/super.c b/fs/super.c index 1182af8fd5ff..74914b1bae70 100644 --- a/fs/super.c +++ b/fs/super.c @@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb) sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(sb); + cgroup_writeback_umount(); evict_inodes(sb); diff --git a/include/linux/bio.h b/include/linux/bio.h index 5349e6816cbb..cb6888824108 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) bio->bi_flags &= ~(1U << bit); } +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = bio_iovec(bio); +} + +static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + if (!bio_flagged(bio, BIO_CLONED)) { + *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; + return; + } + + if (unlikely(!bio_multiple_segments(bio))) { + *bv = bio_iovec(bio); + return; + } + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4571ef1a12a9..413c84fbc4ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) + if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) @@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. @@ -1381,18 +1388,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, { if (!queue_virt_boundary(q)) return false; - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); + return __bvec_gap_to_prev(q, bprv, offset); } static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { - if (!bio_has_data(prev)) - return false; + if (bio_has_data(prev) && queue_virt_boundary(q)) { + struct bio_vec pb, nb; - return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + } + + return false; } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b333c945e571..d0b5ca5d4e08 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes); +void cgroup_writeback_umount(void); /** * inode_attach_wb - associate an inode with its wb @@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc, { } +static inline void cgroup_writeback_umount(void) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /*