diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 2122d1a4a541..ba45c46cc0da 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -144,6 +144,42 @@ managing and controlling ublk devices with help of several control commands: For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's responsibility to save IO target specific info in userspace. +- ``UBLK_CMD_START_USER_RECOVERY`` + + This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This + command is accepted after the old process has exited, ublk device is quiesced + and ``/dev/ublkc*`` is released. User should send this command before he starts + a new process which re-opens ``/dev/ublkc*``. When this command returns, the + ublk device is ready for the new process. + +- ``UBLK_CMD_END_USER_RECOVERY`` + + This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This + command is accepted after ublk device is quiesced and a new process has + opened ``/dev/ublkc*`` and get all ublk queues be ready. When this command + returns, ublk device is unquiesced and new I/O requests are passed to the + new process. + +- user recovery feature description + + Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and + ``UBLK_F_USER_RECOVERY_REISSUE``. + + With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io + handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole + recovery stage and ublk device ID is kept. It is ublk server's + responsibility to recover the device context by its own knowledge. + Requests which have not been issued to userspace are requeued. Requests + which have been issued to userspace are aborted. + + With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk + server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``, + requests which have been issued to userspace are requeued and will be + re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``. + ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate + double-write since the driver may issue the same I/O request twice. It + might be useful to a read-only FS or a VM backend. + Data plane ---------- diff --git a/MAINTAINERS b/MAINTAINERS index 8904f186c83b..bdd43abd2683 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14713,6 +14713,12 @@ F: drivers/nvme/target/auth.c F: drivers/nvme/target/fabrics-cmd-auth.c F: include/linux/nvme-auth.h +NVM EXPRESS HARDWARE MONITORING SUPPORT +M: Guenter Roeck +L: linux-nvme@lists.infradead.org +S: Supported +F: drivers/nvme/host/hwmon.c + NVM EXPRESS FC TRANSPORT DRIVERS M: James Smart L: linux-nvme@lists.infradead.org diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 64ee618064ba..71f721670ab6 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -369,12 +369,8 @@ struct bfq_queue { unsigned long split_time; /* time of last split */ unsigned long first_IO_time; /* time of first I/O for this queue */ - unsigned long creation_time; /* when this queue is created */ - /* max service rate measured so far */ - u32 max_service_rate; - /* * Pointer to the waker queue for this queue, i.e., to the * queue Q such that this queue happens to get new I/O right diff --git a/block/bio.c b/block/bio.c index 633a902468ec..57c2f327225b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -741,7 +741,7 @@ void bio_put(struct bio *bio) return; } - if (bio->bi_opf & REQ_ALLOC_CACHE) { + if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) { struct bio_alloc_cache *cache; bio_uninit(bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index 8070b6c10e8d..33292c01875d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3112,8 +3112,11 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct page *page; unsigned long flags; - /* There is no need to clear a driver tags own mapping */ - if (drv_tags == tags) + /* + * There is no need to clear mapping if driver tags is not initialized + * or the mapping belongs to the driver tags. + */ + if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8f7f144e54f3..7f9bcc82fc9c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,11 +30,6 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, bio_src, - GFP_NOIO, &drbd_io_bio_set); - req->private_bio->bi_private = req; - req->private_bio->bi_end_io = drbd_request_endio; - req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); @@ -1219,9 +1214,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio) /* Update disk stats */ req->start_jif = bio_start_io_acct(req->master_bio); - if (!get_ldev(device)) { - bio_put(req->private_bio); - req->private_bio = NULL; + if (get_ldev(device)) { + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, + bio, GFP_NOIO, + &drbd_io_bio_set); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; } /* process discards always from our submitter thread */ diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2651bf41dde3..5afce6ffaadf 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -124,7 +124,7 @@ struct ublk_queue { bool force_abort; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; - struct ublk_io ios[0]; + struct ublk_io ios[]; }; #define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 5fc5ea196b40..ff8b083dc5c6 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1039,6 +1039,8 @@ static void apple_nvme_reset_work(struct work_struct *work) dma_max_mapping_size(anv->dev) >> 9); anv->ctrl.max_segments = NVME_MAX_SEGS; + dma_set_max_seg_size(anv->dev, 0xffffffff); + /* * Enable NVMMU and linear submission queues. * While we could keep those disabled and pretend this is slightly diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 059737c1a2c1..dc4220600585 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3262,8 +3262,12 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) return ret; if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { + /* + * Do not return errors unless we are in a controller reset, + * the controller works perfectly fine without hwmon. + */ ret = nvme_hwmon_init(ctrl); - if (ret < 0) + if (ret == -EINTR) return ret; } @@ -4846,7 +4850,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return 0; out_cleanup_admin_q: - blk_mq_destroy_queue(ctrl->fabrics_q); + blk_mq_destroy_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 0a586d712920..9e6e56c20ec9 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -12,7 +12,7 @@ struct nvme_hwmon_data { struct nvme_ctrl *ctrl; - struct nvme_smart_log log; + struct nvme_smart_log *log; struct mutex read_lock; }; @@ -60,14 +60,14 @@ static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under, static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) { return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - NVME_CSI_NVM, &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, data->log, sizeof(*data->log), 0); } static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { struct nvme_hwmon_data *data = dev_get_drvdata(dev); - struct nvme_smart_log *log = &data->log; + struct nvme_smart_log *log = data->log; int temp; int err; @@ -163,7 +163,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data, case hwmon_temp_max: case hwmon_temp_min: if ((!channel && data->ctrl->wctemp) || - (channel && data->log.temp_sensor[channel - 1])) { + (channel && data->log->temp_sensor[channel - 1])) { if (data->ctrl->quirks & NVME_QUIRK_NO_TEMP_THRESH_CHANGE) return 0444; @@ -176,7 +176,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data, break; case hwmon_temp_input: case hwmon_temp_label: - if (!channel || data->log.temp_sensor[channel - 1]) + if (!channel || data->log->temp_sensor[channel - 1]) return 0444; break; default: @@ -230,7 +230,13 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - return 0; + return -ENOMEM; + + data->log = kzalloc(sizeof(*data->log), GFP_KERNEL); + if (!data->log) { + err = -ENOMEM; + goto err_free_data; + } data->ctrl = ctrl; mutex_init(&data->read_lock); @@ -238,8 +244,7 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { dev_warn(dev, "Failed to read smart log (error %d)\n", err); - kfree(data); - return err; + goto err_free_log; } hwmon = hwmon_device_register_with_info(dev, "nvme", @@ -247,11 +252,17 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) NULL); if (IS_ERR(hwmon)) { dev_warn(dev, "Failed to instantiate hwmon device\n"); - kfree(data); - return PTR_ERR(hwmon); + err = PTR_ERR(hwmon); + goto err_free_log; } ctrl->hwmon_device = hwmon; return 0; + +err_free_log: + kfree(data->log); +err_free_data: + kfree(data); + return err; } void nvme_hwmon_exit(struct nvme_ctrl *ctrl) @@ -262,6 +273,7 @@ void nvme_hwmon_exit(struct nvme_ctrl *ctrl) hwmon_device_unregister(ctrl->hwmon_device); ctrl->hwmon_device = NULL; + kfree(data->log); kfree(data); } } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bcbef6bc5672..31e577b01257 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3511,6 +3511,16 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x2646, 0x5018), /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x5016), /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501A), /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501B), /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1002), /* MAXIO MAP1002 */ diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e34a2896fedb..9443ee1d4ae3 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1290,12 +1290,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, const char *page, size_t cnt) { - struct nvmet_port *port = to_nvmet_port(item); u16 qid_max; - if (nvmet_is_port_enabled(port, __func__)) - return -EACCES; - if (sscanf(page, "%hu\n", &qid_max) != 1) return -EINVAL; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 14677145bbba..aecb5853f8da 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1176,7 +1176,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) * reset the keep alive timer when the controller is enabled. */ if (ctrl->kato) - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + mod_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ); } static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7f5eb295fe19..a995ea1ef849 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -346,8 +346,40 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { + blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); @@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(q, bt); + blk_trace_cleanup(q, bt); return 0; } @@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { - int ret; struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, @@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start) if (bt == NULL) return -EINVAL; - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - raw_spin_lock_irq(&running_trace_lock); - list_add(&bt->running_list, &running_trace_list); - raw_spin_unlock_irq(&running_trace_lock); - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) @@ -772,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->debugfs_mutex))) { - __blk_trace_startstop(q, 0); + lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); - } } #ifdef CONFIG_BLK_CGROUP @@ -1614,13 +1616,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - } + blk_trace_stop(bt); put_probe_ref(); synchronize_rcu();