diff --git a/MAINTAINERS b/MAINTAINERS index 8349c1a76970..2336dd26ece4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2627,7 +2627,7 @@ F: Documentation/devicetree/bindings/eeprom/at24.txt F: drivers/misc/eeprom/at24.c ATA OVER ETHERNET (AOE) DRIVER -M: "Ed L. Cashin" +M: "Justin Sanders" W: http://www.openaoe.org/ S: Supported F: Documentation/aoe/ @@ -11226,7 +11226,7 @@ F: drivers/video/fbdev/riva/ F: drivers/video/fbdev/nvidia/ NVM EXPRESS DRIVER -M: Keith Busch +M: Keith Busch M: Jens Axboe M: Christoph Hellwig M: Sagi Grimberg diff --git a/block/blk-core.c b/block/blk-core.c index 419d600e6637..1bf83a0df0f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -413,7 +413,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) smp_rmb(); wait_event(q->mq_freeze_wq, - (atomic_read(&q->mq_freeze_depth) == 0 && + (!q->mq_freeze_depth && (pm || (blk_pm_request_resume(q), !blk_queue_pm_only(q)))) || blk_queue_dying(q)); @@ -503,6 +503,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); + mutex_init(&q->mq_freeze_lock); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. diff --git a/block/blk-merge.c b/block/blk-merge.c index 21e87a714a73..17713d7d98d5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,23 +12,6 @@ #include "blk.h" -/* - * Check if the two bvecs from two bios can be merged to one segment. If yes, - * no need to check gap between the two bios since the 1st bio and the 1st bvec - * in the 2nd bio can be handled in one segment. - */ -static inline bool bios_segs_mergeable(struct request_queue *q, - struct bio *prev, struct bio_vec *prev_last_bv, - struct bio_vec *next_first_bv) -{ - if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv)) - return false; - if (prev->bi_seg_back_size + next_first_bv->bv_len > - queue_max_segment_size(q)) - return false; - return true; -} - static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -60,7 +43,7 @@ static inline bool bio_will_gap(struct request_queue *q, */ bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); - if (bios_segs_mergeable(q, prev, &pb, &nb)) + if (biovec_phys_mergeable(q, &pb, &nb)) return false; return __bvec_gap_to_prev(q, &pb, nb.bv_offset); } @@ -179,8 +162,7 @@ static unsigned get_max_segment_size(struct request_queue *q, * variables. */ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, - unsigned *nsegs, unsigned *last_seg_size, - unsigned *front_seg_size, unsigned *sectors, unsigned max_segs) + unsigned *nsegs, unsigned *sectors, unsigned max_segs) { unsigned len = bv->bv_len; unsigned total_len = 0; @@ -202,28 +184,12 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, break; } - if (!new_nsegs) - return !!len; - - /* update front segment size */ - if (!*nsegs) { - unsigned first_seg_size; - - if (new_nsegs == 1) - first_seg_size = get_max_segment_size(q, bv->bv_offset); - else - first_seg_size = queue_max_segment_size(q); - - if (*front_seg_size < first_seg_size) - *front_seg_size = first_seg_size; + if (new_nsegs) { + *nsegs += new_nsegs; + if (sectors) + *sectors += total_len >> 9; } - /* update other varibles */ - *last_seg_size = seg_size; - *nsegs += new_nsegs; - if (sectors) - *sectors += total_len >> 9; - /* split in the middle of the bvec if len != 0 */ return !!len; } @@ -235,8 +201,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; - unsigned seg_size = 0, nsegs = 0, sectors = 0; - unsigned front_seg_size = bio->bi_seg_front_size; + unsigned nsegs = 0, sectors = 0; bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); @@ -260,8 +225,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, /* split in the middle of bvec */ bv.bv_len = (max_sectors - sectors) << 9; bvec_split_segs(q, &bv, &nsegs, - &seg_size, - &front_seg_size, §ors, max_segs); } goto split; @@ -275,12 +238,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++; - seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; - } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, - &front_seg_size, §ors, max_segs)) { + } else if (bvec_split_segs(q, &bv, &nsegs, §ors, + max_segs)) { goto split; } } @@ -295,10 +255,6 @@ split: bio = new; } - bio->bi_seg_front_size = front_seg_size; - if (seg_size > bio->bi_seg_back_size) - bio->bi_seg_back_size = seg_size; - return do_split ? new : NULL; } @@ -353,18 +309,13 @@ EXPORT_SYMBOL(blk_queue_split); static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { - struct bio_vec uninitialized_var(bv), bvprv = { NULL }; - unsigned int seg_size, nr_phys_segs; - unsigned front_seg_size; - struct bio *fbio, *bbio; + unsigned int nr_phys_segs = 0; struct bvec_iter iter; - bool new_bio = false; + struct bio_vec bv; if (!bio) return 0; - front_seg_size = bio->bi_seg_front_size; - switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: @@ -374,42 +325,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 1; } - fbio = bio; - seg_size = 0; - nr_phys_segs = 0; for_each_bio(bio) { - bio_for_each_bvec(bv, bio, iter) { - if (new_bio) { - if (seg_size + bv.bv_len - > queue_max_segment_size(q)) - goto new_segment; - if (!biovec_phys_mergeable(q, &bvprv, &bv)) - goto new_segment; - - seg_size += bv.bv_len; - - if (nr_phys_segs == 1 && seg_size > - front_seg_size) - front_seg_size = seg_size; - - continue; - } -new_segment: - bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, - &front_seg_size, NULL, UINT_MAX); - new_bio = false; - } - bbio = bio; - if (likely(bio->bi_iter.bi_size)) { - bvprv = bv; - new_bio = true; - } + bio_for_each_bvec(bv, bio, iter) + bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX); } - fbio->bi_seg_front_size = front_seg_size; - if (seg_size > bbio->bi_seg_back_size) - bbio->bi_seg_back_size = seg_size; - return nr_phys_segs; } @@ -429,24 +349,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio_set_flag(bio, BIO_SEG_VALID); } -static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, - struct bio *nxt) -{ - struct bio_vec end_bv = { NULL }, nxt_bv; - - if (bio->bi_seg_back_size + nxt->bi_seg_front_size > - queue_max_segment_size(q)) - return 0; - - if (!bio_has_data(bio)) - return 1; - - bio_get_last_bvec(bio, &end_bv); - bio_get_first_bvec(nxt, &nxt_bv); - - return biovec_phys_mergeable(q, &end_bv, &nxt_bv); -} - static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { @@ -706,8 +608,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { int total_phys_segments; - unsigned int seg_size = - req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; if (req_gap_back_merge(req, next->bio)) return 0; @@ -720,14 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (blk_phys_contig_segment(q, req->biotail, next->bio)) { - if (req->nr_phys_segments == 1) - req->bio->bi_seg_front_size = seg_size; - if (next->nr_phys_segments == 1) - next->biotail->bi_seg_back_size = seg_size; - total_phys_segments--; - } - if (total_phys_segments > queue_max_segments(q)) return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index 08a6248d8536..32b8ad3d341b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -144,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, void blk_freeze_queue_start(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_inc_return(&q->mq_freeze_depth); - if (freeze_depth == 1) { + mutex_lock(&q->mq_freeze_lock); + if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); + mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + } else { + mutex_unlock(&q->mq_freeze_lock); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -199,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_dec_return(&q->mq_freeze_depth); - WARN_ON_ONCE(freeze_depth < 0); - if (!freeze_depth) { + mutex_lock(&q->mq_freeze_lock); + q->mq_freeze_depth--; + WARN_ON_ONCE(q->mq_freeze_depth < 0); + if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + mutex_unlock(&q->mq_freeze_lock); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); diff --git a/block/blk-settings.c b/block/blk-settings.c index 3facc41476be..2ae348c101a0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -310,6 +310,9 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) __func__, max_size); } + /* see blk_queue_virt_boundary() for the explanation */ + WARN_ON_ONCE(q->limits.virt_boundary_mask); + q->limits.max_segment_size = max_size; } EXPORT_SYMBOL(blk_queue_max_segment_size); @@ -742,6 +745,14 @@ EXPORT_SYMBOL(blk_queue_segment_boundary); void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) { q->limits.virt_boundary_mask = mask; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be idential to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */ + q->limits.max_segment_size = UINT_MAX; } EXPORT_SYMBOL(blk_queue_virt_boundary); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7da80f375315..1b7c2afd84cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1257,9 +1257,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return 0; } - effects |= nvme_known_admin_effects(opcode); if (ctrl->effects) effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); /* * For simplicity, IO to all namespaces is quiesced even if the command @@ -1361,9 +1361,14 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, { #ifdef CONFIG_NVME_MULTIPATH if (disk->fops == &nvme_ns_head_ops) { + struct nvme_ns *ns; + *head = disk->private_data; *srcu_idx = srcu_read_lock(&(*head)->srcu); - return nvme_find_path(*head); + ns = nvme_find_path(*head); + if (!ns) + srcu_read_unlock(&(*head)->srcu, *srcu_idx); + return ns; } #endif *head = NULL; @@ -1377,42 +1382,56 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) -{ - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->head->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - default: -#ifdef CONFIG_NVM - if (ns->ndev) - return nvme_nvm_ioctl(ns, cmd, arg); -#endif - if (is_sed_ioctl(cmd)) - return sed_ioctl(ns->ctrl->opal_dev, cmd, - (void __user *) arg); - return -ENOTTY; - } -} - static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = NULL; + void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret; ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); if (unlikely(!ns)) - ret = -EWOULDBLOCK; - else - ret = nvme_ns_ioctl(ns, cmd, arg); + return -EWOULDBLOCK; + + /* + * Handle ioctls that apply to the controller instead of the namespace + * seperately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface. + */ + if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { + struct nvme_ctrl *ctrl = ns->ctrl; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + + if (cmd == NVME_IOCTL_ADMIN_CMD) + ret = nvme_user_cmd(ctrl, NULL, argp); + else + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + + nvme_put_ctrl(ctrl); + return ret; + } + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + ret = ns->head->ns_id; + break; + case NVME_IOCTL_IO_CMD: + ret = nvme_user_cmd(ns->ctrl, ns, argp); + break; + case NVME_IOCTL_SUBMIT_IO: + ret = nvme_submit_io(ns, argp); + break; + default: + if (ns->ndev) + ret = nvme_nvm_ioctl(ns, cmd, arg); + else + ret = -ENOTTY; + } + nvme_put_ns_from_disk(head, srcu_idx); return ret; } @@ -2557,6 +2576,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpu(id->oncs); + ctrl->mtfa = le16_to_cpu(id->mtfa); ctrl->oaes = le32_to_cpu(id->oaes); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; @@ -3681,6 +3701,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); @@ -3880,6 +3901,18 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); + +void nvme_sync_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_sync_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_sync_queues); + /* * Check we didn't inadvertently grow the command structure sizes: */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5ee75b5ff83f..55553d293a98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -441,6 +441,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2a8708c9ac18..f562154551ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -464,7 +464,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) * affinity), so use the regular blk-mq cpu mapping */ map->queue_offset = qoff; - if (i != HCTX_TYPE_POLL) + if (i != HCTX_TYPE_POLL && offset) blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); else blk_mq_map_queues(map); @@ -1257,7 +1257,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - bool shutdown = false; u32 csts = readl(dev->bar + NVME_REG_CSTS); /* If PCI error recovery process is happening, we cannot reset or @@ -1294,17 +1293,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { - case NVME_CTRL_DELETING: - shutdown = true; - /* fall through */ case NVME_CTRL_CONNECTING: - case NVME_CTRL_RESETTING: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + /* fall through */ + case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, shutdown); + nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; default: break; } @@ -2376,7 +2376,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true; + bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); @@ -2384,8 +2384,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) + dev->ctrl.state == NVME_CTRL_RESETTING) { + freeze = true; nvme_start_freeze(&dev->ctrl); + } dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); } @@ -2394,10 +2396,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead) { - if (shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - } + if (!dead && shutdown && freeze) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); nvme_stop_queues(&dev->ctrl); @@ -2492,6 +2492,7 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + nvme_sync_queues(&dev->ctrl); mutex_lock(&dev->shutdown_lock); result = nvme_pci_enable(dev); diff --git a/include/linux/bio.h b/include/linux/bio.h index ea73df36529a..0f23b5682640 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -210,7 +210,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) { if (count != 1) { bio->bi_flags |= (1 << BIO_REFFED); - smp_mb__before_atomic(); + smp_mb(); } atomic_set(&bio->__bi_cnt, count); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be418275763c..95202f80676c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -159,13 +159,6 @@ struct bio { */ unsigned int bi_phys_segments; - /* - * To keep track of the max segment size, we account for the - * sizes of the first and last mergeable segments in this bio. - */ - unsigned int bi_seg_front_size; - unsigned int bi_seg_back_size; - struct bvec_iter bi_iter; atomic_t __bi_remaining; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1aafeb923e7b..592669bcc536 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -542,7 +542,7 @@ struct request_queue { struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; - atomic_t mq_freeze_depth; + int mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; @@ -554,6 +554,11 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; + /* + * Protect concurrent access to q_usage_counter by + * percpu_ref_kill() and percpu_ref_reinit(). + */ + struct mutex mq_freeze_lock; struct percpu_ref q_usage_counter; struct blk_mq_tag_set *tag_set; diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 155fe38756ec..4a7fc4915dfc 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -435,7 +435,7 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, * to ensure that the batch size is updated before the wait * counts. */ - smp_mb__before_atomic(); + smp_mb(); for (i = 0; i < SBQ_WAIT_QUEUES; i++) atomic_set(&sbq->ws[i].wait_cnt, 1); } diff --git a/tools/io_uring/Makefile b/tools/io_uring/Makefile index f79522fc37b5..00f146c54c53 100644 --- a/tools/io_uring/Makefile +++ b/tools/io_uring/Makefile @@ -8,7 +8,7 @@ all: io_uring-cp io_uring-bench $(CC) $(CFLAGS) -o $@ $^ io_uring-bench: syscall.o io_uring-bench.o - $(CC) $(CFLAGS) $(LDLIBS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) io_uring-cp: setup.o syscall.o queue.o diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c index 633f65bb43a7..81461813ec62 100644 --- a/tools/io_uring/io_uring-cp.c +++ b/tools/io_uring/io_uring-cp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -85,11 +86,16 @@ static int queue_read(struct io_uring *ring, off_t size, off_t offset) struct io_uring_sqe *sqe; struct io_data *data; - sqe = io_uring_get_sqe(ring); - if (!sqe) + data = malloc(size + sizeof(*data)); + if (!data) return 1; - data = malloc(size + sizeof(*data)); + sqe = io_uring_get_sqe(ring); + if (!sqe) { + free(data); + return 1; + } + data->read = 1; data->offset = data->first_offset = offset; @@ -166,22 +172,23 @@ static int copy_file(struct io_uring *ring, off_t insize) struct io_data *data; if (!got_comp) { - ret = io_uring_wait_completion(ring, &cqe); + ret = io_uring_wait_cqe(ring, &cqe); got_comp = 1; } else - ret = io_uring_get_completion(ring, &cqe); + ret = io_uring_peek_cqe(ring, &cqe); if (ret < 0) { - fprintf(stderr, "io_uring_get_completion: %s\n", + fprintf(stderr, "io_uring_peek_cqe: %s\n", strerror(-ret)); return 1; } if (!cqe) break; - data = (struct io_data *) (uintptr_t) cqe->user_data; + data = io_uring_cqe_get_data(cqe); if (cqe->res < 0) { if (cqe->res == -EAGAIN) { queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } fprintf(stderr, "cqe failed: %s\n", @@ -193,6 +200,7 @@ static int copy_file(struct io_uring *ring, off_t insize) data->iov.iov_len -= cqe->res; data->offset += cqe->res; queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } @@ -209,6 +217,7 @@ static int copy_file(struct io_uring *ring, off_t insize) free(data); writes--; } + io_uring_cqe_seen(ring, cqe); } } diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index cab0f50257ba..5f305c86b892 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -1,10 +1,16 @@ #ifndef LIB_URING_H #define LIB_URING_H +#ifdef __cplusplus +extern "C" { +#endif + #include #include #include #include "../../include/uapi/linux/io_uring.h" +#include +#include "barrier.h" /* * Library interface to io_uring @@ -46,7 +52,7 @@ struct io_uring { * System calls */ extern int io_uring_setup(unsigned entries, struct io_uring_params *p); -extern int io_uring_enter(unsigned fd, unsigned to_submit, +extern int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig); extern int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args); @@ -59,13 +65,32 @@ extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring); extern void io_uring_queue_exit(struct io_uring *ring); -extern int io_uring_get_completion(struct io_uring *ring, +extern int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); -extern int io_uring_wait_completion(struct io_uring *ring, +extern int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); extern int io_uring_submit(struct io_uring *ring); extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); +/* + * Must be called after io_uring_{peek,wait}_cqe() after the cqe has + * been processed by the application. + */ +static inline void io_uring_cqe_seen(struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + if (cqe) { + struct io_uring_cq *cq = &ring->cq; + + (*cq->khead)++; + /* + * Ensure that the kernel sees our new head, the kernel has + * the matching read barrier. + */ + write_barrier(); + } +} + /* * Command prep helpers */ @@ -74,8 +99,14 @@ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) sqe->user_data = (unsigned long) data; } +static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) +{ + return (void *) (uintptr_t) cqe->user_data; +} + static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - void *addr, unsigned len, off_t offset) + const void *addr, unsigned len, + off_t offset) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = op; @@ -86,8 +117,8 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); } @@ -100,14 +131,14 @@ static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); } static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, + const void *buf, unsigned nbytes, off_t offset) { io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); @@ -131,13 +162,22 @@ static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, } static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, - int datasync) + unsigned fsync_flags) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_FSYNC; sqe->fd = fd; - if (datasync) - sqe->fsync_flags = IORING_FSYNC_DATASYNC; + sqe->fsync_flags = fsync_flags; } +static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) +{ + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_NOP; +} + +#ifdef __cplusplus +} +#endif + #endif diff --git a/tools/io_uring/queue.c b/tools/io_uring/queue.c index 88505e873ad9..321819c132c7 100644 --- a/tools/io_uring/queue.c +++ b/tools/io_uring/queue.c @@ -8,8 +8,8 @@ #include "liburing.h" #include "barrier.h" -static int __io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, int wait) +static int __io_uring_get_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, int wait) { struct io_uring_cq *cq = &ring->cq; const unsigned mask = *cq->kring_mask; @@ -39,34 +39,25 @@ static int __io_uring_get_completion(struct io_uring *ring, return -errno; } while (1); - if (*cqe_ptr) { - *cq->khead = head + 1; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } - return 0; } /* - * Return an IO completion, if one is readily available + * Return an IO completion, if one is readily available. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. */ -int io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 0); + return __io_uring_get_cqe(ring, cqe_ptr, 0); } /* - * Return an IO completion, waiting for it if necessary + * Return an IO completion, waiting for it if necessary. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. */ -int io_uring_wait_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 1); + return __io_uring_get_cqe(ring, cqe_ptr, 1); } /* @@ -78,7 +69,7 @@ int io_uring_submit(struct io_uring *ring) { struct io_uring_sq *sq = &ring->sq; const unsigned mask = *sq->kring_mask; - unsigned ktail, ktail_next, submitted; + unsigned ktail, ktail_next, submitted, to_submit; int ret; /* @@ -100,7 +91,8 @@ int io_uring_submit(struct io_uring *ring) */ submitted = 0; ktail = ktail_next = *sq->ktail; - while (sq->sqe_head < sq->sqe_tail) { + to_submit = sq->sqe_tail - sq->sqe_head; + while (to_submit--) { ktail_next++; read_barrier(); @@ -136,7 +128,7 @@ submit: if (ret < 0) return -errno; - return 0; + return ret; } /* diff --git a/tools/io_uring/setup.c b/tools/io_uring/setup.c index 4da19a77132c..0b50fcd78520 100644 --- a/tools/io_uring/setup.c +++ b/tools/io_uring/setup.c @@ -27,7 +27,7 @@ static int io_uring_mmap(int fd, struct io_uring_params *p, sq->kdropped = ptr + p->sq_off.dropped; sq->array = ptr + p->sq_off.array; - size = p->sq_entries * sizeof(struct io_uring_sqe), + size = p->sq_entries * sizeof(struct io_uring_sqe); sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); @@ -79,7 +79,7 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) { struct io_uring_params p; - int fd; + int fd, ret; memset(&p, 0, sizeof(p)); p.flags = flags; @@ -88,7 +88,11 @@ int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) if (fd < 0) return fd; - return io_uring_queue_mmap(fd, &p, ring); + ret = io_uring_queue_mmap(fd, &p, ring); + if (ret) + close(fd); + + return ret; } void io_uring_queue_exit(struct io_uring *ring) diff --git a/tools/io_uring/syscall.c b/tools/io_uring/syscall.c index 6b835e5c6a5b..b22e0aa54e9d 100644 --- a/tools/io_uring/syscall.c +++ b/tools/io_uring/syscall.c @@ -7,34 +7,46 @@ #include #include "liburing.h" -#if defined(__x86_64) || defined(__i386__) -#ifndef __NR_sys_io_uring_setup -#define __NR_sys_io_uring_setup 425 -#endif -#ifndef __NR_sys_io_uring_enter -#define __NR_sys_io_uring_enter 426 -#endif -#ifndef __NR_sys_io_uring_register -#define __NR_sys_io_uring_register 427 -#endif -#else -#error "Arch not supported yet" +#ifdef __alpha__ +/* + * alpha is the only exception, all other architectures + * have common numbers for new system calls. + */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 535 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 536 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 537 +# endif +#else /* !__alpha__ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +# endif #endif int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args) { - return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args); + return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); } -int io_uring_setup(unsigned entries, struct io_uring_params *p) +int io_uring_setup(unsigned int entries, struct io_uring_params *p) { - return syscall(__NR_sys_io_uring_setup, entries, p); + return syscall(__NR_io_uring_setup, entries, p); } -int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, - unsigned flags, sigset_t *sig) +int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, + unsigned int flags, sigset_t *sig) { - return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete, + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig, _NSIG / 8); }