From 6cc8e7430801fa238bd7d3acae1eb406c6e02fe1 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 26 Jan 2021 09:46:30 -0500 Subject: [PATCH 01/64] loop: scale loop device by introducing per device lock Currently, loop device has only one global lock: loop_ctl_mutex. This becomes hot in scenarios where many loop devices are used. Scale it by introducing per-device lock: lo_mutex that protects modifications of all fields in struct loop_device. Keep loop_ctl_mutex to protect global data: loop_index_idr, loop_lookup, loop_add. The new lock ordering requirement is that loop_ctl_mutex must be taken before lo_mutex. Signed-off-by: Pavel Tatashin Reviewed-by: Tyler Hicks Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- drivers/block/loop.c | 93 +++++++++++++++++++++++++------------------- drivers/block/loop.h | 1 + 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e5ff328f0917..578fc034db3f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -704,7 +704,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, int error; bool partscan; - error = mutex_lock_killable(&loop_ctl_mutex); + error = mutex_lock_killable(&lo->lo_mutex); if (error) return error; error = -ENXIO; @@ -743,9 +743,9 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); /* - * We must drop file reference outside of loop_ctl_mutex as dropping + * We must drop file reference outside of lo_mutex as dropping * the file ref can take bd_mutex which creates circular locking * dependency. */ @@ -755,7 +755,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, return 0; out_err: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); if (file) fput(file); return error; @@ -1092,7 +1092,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, goto out_putf; } - error = mutex_lock_killable(&loop_ctl_mutex); + error = mutex_lock_killable(&lo->lo_mutex); if (error) goto out_bdev; @@ -1171,7 +1171,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); if (partscan) loop_reread_partitions(lo, bdev); if (!(mode & FMODE_EXCL)) @@ -1179,7 +1179,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, return 0; out_unlock: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); out_bdev: if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); @@ -1200,7 +1200,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) bool partscan = false; int lo_number; - mutex_lock(&loop_ctl_mutex); + mutex_lock(&lo->lo_mutex); if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { err = -ENXIO; goto out_unlock; @@ -1253,7 +1253,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) lo_number = lo->lo_number; loop_unprepare_queue(lo); out_unlock: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); if (partscan) { /* * bd_mutex has been held already in release path, so don't @@ -1284,18 +1284,17 @@ out_unlock: * protects us from all the other places trying to change the 'lo' * device. 
*/ - mutex_lock(&loop_ctl_mutex); + mutex_lock(&lo->lo_mutex); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; lo->lo_state = Lo_unbound; - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); /* - * Need not hold loop_ctl_mutex to fput backing file. - * Calling fput holding loop_ctl_mutex triggers a circular - * lock dependency possibility warning as fput can take - * bd_mutex which is usually taken before loop_ctl_mutex. + * Need not hold lo_mutex to fput backing file. Calling fput holding + * lo_mutex triggers a circular lock dependency possibility warning as + * fput can take bd_mutex which is usually taken before lo_mutex. */ if (filp) fput(filp); @@ -1306,11 +1305,11 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&loop_ctl_mutex); + err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return -ENXIO; } /* @@ -1325,11 +1324,11 @@ static int loop_clr_fd(struct loop_device *lo) */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return __loop_clr_fd(lo, false); } @@ -1344,7 +1343,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) bool partscan = false; bool size_changed = false; - err = mutex_lock_killable(&loop_ctl_mutex); + err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_encrypt_key_size && @@ -1411,7 +1410,7 @@ out_unfreeze: partscan = true; } out_unlock: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); if (partscan) loop_reread_partitions(lo, bdev); @@ -1425,11 +1424,11 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) struct kstat stat; int ret; - ret = mutex_lock_killable(&loop_ctl_mutex); + ret = mutex_lock_killable(&lo->lo_mutex); if (ret) return ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return -ENXIO; } @@ -1448,10 +1447,10 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) lo->lo_encrypt_key_size); } - /* Drop loop_ctl_mutex while we call into the filesystem. */ + /* Drop lo_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); @@ -1637,7 +1636,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, { int err; - err = mutex_lock_killable(&loop_ctl_mutex); + err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; switch (cmd) { @@ -1653,7 +1652,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return err; } @@ -1879,27 +1878,33 @@ static int lo_open(struct block_device *bdev, fmode_t mode) struct loop_device *lo; int err; + /* + * take loop_ctl_mutex to protect lo pointer from race with + * loop_control_ioctl(LOOP_CTL_REMOVE), however, to reduce contention + * release it prior to updating lo->lo_refcnt. 
+ */ err = mutex_lock_killable(&loop_ctl_mutex); if (err) return err; lo = bdev->bd_disk->private_data; if (!lo) { - err = -ENXIO; - goto out; + mutex_unlock(&loop_ctl_mutex); + return -ENXIO; } - - atomic_inc(&lo->lo_refcnt); -out: + err = mutex_lock_killable(&lo->lo_mutex); mutex_unlock(&loop_ctl_mutex); - return err; + if (err) + return err; + atomic_inc(&lo->lo_refcnt); + mutex_unlock(&lo->lo_mutex); + return 0; } static void lo_release(struct gendisk *disk, fmode_t mode) { - struct loop_device *lo; + struct loop_device *lo = disk->private_data; - mutex_lock(&loop_ctl_mutex); - lo = disk->private_data; + mutex_lock(&lo->lo_mutex); if (atomic_dec_return(&lo->lo_refcnt)) goto out_unlock; @@ -1907,7 +1912,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) if (lo->lo_state != Lo_bound) goto out_unlock; lo->lo_state = Lo_rundown; - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); /* * In autoclear mode, stop the loop thread * and remove configuration after last close. @@ -1924,7 +1929,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) } out_unlock: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); } static const struct block_device_operations lo_fops = { @@ -1963,10 +1968,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data) struct loop_device *lo = ptr; struct loop_func_table *xfer = data; - mutex_lock(&loop_ctl_mutex); + mutex_lock(&lo->lo_mutex); if (lo->lo_encryption == xfer) loop_release_xfer(lo); - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return 0; } @@ -2152,6 +2157,7 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; atomic_set(&lo->lo_refcnt, 0); + mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; @@ -2182,6 +2188,7 @@ static void loop_remove(struct loop_device *lo) blk_cleanup_queue(lo->lo_queue); blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); + mutex_destroy(&lo->lo_mutex); kfree(lo); } @@ -2261,15 +2268,21 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, ret = loop_lookup(&lo, parm); if (ret < 0) break; + ret = mutex_lock_killable(&lo->lo_mutex); + if (ret) + break; if (lo->lo_state != Lo_unbound) { ret = -EBUSY; + mutex_unlock(&lo->lo_mutex); break; } if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; + mutex_unlock(&lo->lo_mutex); break; } lo->lo_disk->private_data = NULL; + mutex_unlock(&lo->lo_mutex); idr_remove(&loop_index_idr, lo->lo_number); loop_remove(lo); break; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index af75a5ee4094..a3c04f310672 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -62,6 +62,7 @@ struct loop_device { struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; + struct mutex lo_mutex; }; struct loop_cmd { From 416c05477772c147190d6b2371254510c81a4a04 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Jan 2021 14:04:32 -0600 Subject: [PATCH 02/64] mtip32xx: use PCI #defines instead of numbers Use PCI #defines for PCIe Device Control register values instead of hard-coding bit positions. No functional change intended. 
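For reference (not part of the patch), the two macros come from
include/uapi/linux/pci_regs.h and expand to exactly the bits that were
hard-coded before:

    #define PCI_EXP_DEVCTL_RELAX_EN    0x0010  /* Enable Relaxed Ordering, bit 4 */
    #define PCI_EXP_DEVCTL_NOSNOOP_EN  0x0800  /* Enable No Snoop, bit 11 */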
Signed-off-by: Bjorn Helgaas
Signed-off-by: Jens Axboe
---
 drivers/block/mtip32xx/mtip32xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3fd99836bb1c..b58f3a59b5bb 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3932,8 +3932,8 @@ static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 pci_read_config_word(pdev, pos + PCI_EXP_DEVCTL, &pcie_dev_ctrl);
- if (pcie_dev_ctrl & (1 << 11) ||
- pcie_dev_ctrl & (1 << 4)) {
+ if (pcie_dev_ctrl & PCI_EXP_DEVCTL_NOSNOOP_EN ||
+ pcie_dev_ctrl & PCI_EXP_DEVCTL_RELAX_EN) {
 dev_info(&dd->pdev->dev,
 "Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
 pdev->vendor, pdev->device);

From 2126979183148a1bbe8aebe67079856c15ae1763 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Tue, 26 Jan 2021 14:04:33 -0600
Subject: [PATCH 03/64] mtip32xx: prefer pcie_capability_read_word()

Replace pci_read_config_word() with pcie_capability_read_word().
pcie_capability_read_word() takes care of a few special cases when
reading the PCIe capability. See 8c0d3a02c130 ("PCI: Add accessors for
PCI Express Capability").

Signed-off-by: Bjorn Helgaas
Signed-off-by: Jens Axboe
---
 drivers/block/mtip32xx/mtip32xx.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index b58f3a59b5bb..3be0dbc674bd 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3924,14 +3924,10 @@ static DEFINE_HANDLER(7);
 static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 {
- int pos;
 unsigned short pcie_dev_ctrl;
- pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
- if (pos) {
- pci_read_config_word(pdev,
- pos + PCI_EXP_DEVCTL,
- &pcie_dev_ctrl);
+ if (pci_is_pcie(pdev)) {
+ pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &pcie_dev_ctrl);
 if (pcie_dev_ctrl & PCI_EXP_DEVCTL_NOSNOOP_EN ||
 pcie_dev_ctrl & PCI_EXP_DEVCTL_RELAX_EN) {
 dev_info(&dd->pdev->dev,
@@ -3939,8 +3935,7 @@ static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 pdev->vendor, pdev->device);
 pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
 PCI_EXP_DEVCTL_RELAX_EN);
- pci_write_config_word(pdev,
- pos + PCI_EXP_DEVCTL,
+ pcie_capability_write_word(pdev, PCI_EXP_DEVCTL,
 pcie_dev_ctrl);
 }
 }

From 370276bac8ec6f74fb52a518ef05aa84d1059067 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Thu, 21 Jan 2021 15:21:50 +0100
Subject: [PATCH 04/64] drbd: remove unused argument from drbd_request_prepare
 and __drbd_make_request

start_jif is not used by drbd_request_prepare, so remove the argument
there and, in turn, from __drbd_make_request.
Cc: Philipp Reisner Cc: Lars Ellenberg Cc: drbd-dev@lists.linbit.com Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/drbd/drbd_req.c | 11 ++++------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2c93a29c251..de59f72d49cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1449,7 +1449,7 @@ extern void conn_free_crypto(struct drbd_connection *connection); /* drbd_req */ extern void do_submit(struct work_struct *ws); -extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); +extern void __drbd_make_request(struct drbd_device *, struct bio *); extern blk_qc_t drbd_submit_bio(struct bio *bio); extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); extern int is_valid_ar_handle(struct drbd_request *, sector_t); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1c8c18b2a25f..7e5fcce812e1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2288,7 +2288,6 @@ static void do_retry(struct work_struct *ws) list_for_each_entry_safe(req, tmp, &writes, tl_requests) { struct drbd_device *device = req->device; struct bio *bio = req->master_bio; - unsigned long start_jif = req->start_jif; bool expected; expected = @@ -2323,7 +2322,7 @@ static void do_retry(struct work_struct *ws) /* We are not just doing submit_bio_noacct(), * as we want to keep the start_time information. */ inc_ap_bio(device); - __drbd_make_request(device, bio, start_jif); + __drbd_make_request(device, bio); } } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index ea0f31ab3343..ee785f2bdf79 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1188,7 +1188,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. 
*/
static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio)
 {
 const int rw = bio_data_dir(bio);
 struct drbd_request *req;
@@ -1416,9 +1416,9 @@ out: complete_master_bio(device, &m);
 }
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio)
 {
- struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
+ struct drbd_request *req = drbd_request_prepare(device, bio);
 if (IS_ERR_OR_NULL(req))
 return;
 drbd_send_and_submit(device, req);
@@ -1596,19 +1596,16 @@ void do_submit(struct work_struct *ws)
 blk_qc_t drbd_submit_bio(struct bio *bio)
 {
 struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
- unsigned long start_jif;
 blk_queue_split(&bio);
- start_jif = jiffies;
-
 /*
 * what we "blindly" assume:
 */
 D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 inc_ap_bio(device);
- __drbd_make_request(device, bio, start_jif);
+ __drbd_make_request(device, bio);
 return BLK_QC_T_NONE;
 }

From 294ed6b9f00665acc22253044890257c5d9d18c1 Mon Sep 17 00:00:00 2001
From: Tian Tao
Date: Mon, 25 Jan 2021 16:13:01 +0800
Subject: [PATCH 05/64] zram: fix NULL check before some freeing functions is
 not needed

Fix the below warning:

/drivers/block/zram/zram_drv.c:534:2-8: WARNING: NULL check before some
freeing functions is not needed.

Signed-off-by: Tian Tao
Acked-by: Minchan Kim
Signed-off-by: Jens Axboe
---
 drivers/block/zram/zram_drv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d6243dbc53cc..d7018543842e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -530,8 +530,7 @@ static ssize_t backing_dev_store(struct device *dev,
 return len;
 out:
- if (bitmap)
- kvfree(bitmap);
+ kvfree(bitmap);
 if (bdev)
 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);

From 9abe47cc5cbeda75a1ae2ffe6bb8636a0327eddc Mon Sep 17 00:00:00 2001
From: Yang Li
Date: Thu, 21 Jan 2021 17:43:22 +0800
Subject: [PATCH 06/64] rsxx: remove redundant NULL check

Fix the below warning reported by coccicheck:

./drivers/block/rsxx/dma.c:948:3-8: WARNING: NULL check before some
freeing functions is not needed.

Reported-by: Abaci Robot
Signed-off-by: Yang Li
Signed-off-by: Jens Axboe
---
 drivers/block/rsxx/dma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 1914f5488b22..0574f4495755 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -944,8 +944,7 @@ failed_dma_setup:
 ctrl->done_wq = NULL;
 }
- if (ctrl->trackers)
- vfree(ctrl->trackers);
+ vfree(ctrl->trackers);
 if (ctrl->status.buf)
 dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8,

From e8628013e5ddc7cf78cc2f738ab760e8c0fa8559 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Mon, 24 Aug 2020 21:56:03 -0700
Subject: [PATCH 07/64] drbd: Avoid comma separated statements

Use semicolons and braces.
Signed-off-by: Joe Perches
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_receiver.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 09c86ef3f0fd..c3f09a122f20 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -111,8 +111,10 @@ static struct page *page_chain_tail(struct page *page, int *len)
 {
 struct page *tmp;
 int i = 1;
- while ((tmp = page_chain_next(page)))
- ++i, page = tmp;
+ while ((tmp = page_chain_next(page))) {
+ ++i;
+ page = tmp;
+ }
 if (len)
 *len = i;
 return page;

From cc3456226176385aed8aa6ebb021ebb1380a0183 Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Thu, 7 Jan 2021 17:34:13 +0200
Subject: [PATCH 08/64] nvmet: Use nvmet_is_port_enabled helper for pi_enable

Remove code duplication.

Signed-off-by: Israel Rukshin
Reviewed-by: Max Gurtovoy
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/configfs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index c61ffd767062..b2021bf6cee5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -266,10 +266,8 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item,
 if (strtobool(page, &val))
 return -EINVAL;
- if (port->enabled) {
- pr_err("Disable port before setting pi_enable value.\n");
+ if (nvmet_is_port_enabled(port, __func__))
 return -EACCES;
- }
 port->pi_enable = val;
 return count;

From 36ca03c830e41769c62d2ca15be8351059f86c45 Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Thu, 7 Jan 2021 17:34:14 +0200
Subject: [PATCH 09/64] nvmet: Fix nvmet_is_port_enabled indentation

Remove extra tab.

Signed-off-by: Israel Rukshin
Reviewed-by: Max Gurtovoy
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/configfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index b2021bf6cee5..635a7cb45d0b 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -45,7 +45,7 @@ static bool nvmet_is_port_enabled(struct nvmet_port *p, const char *caller)
 {
 if (p->enabled)
 pr_err("Disable port '%u' before changing attribute in %s\n",
- le16_to_cpu(p->disc_addr.portid), caller);
+ le16_to_cpu(p->disc_addr.portid), caller);
 return p->enabled;
 }

From 4e2f02bf77dac7b8c841f93ae5a71556d733cb04 Mon Sep 17 00:00:00 2001
From: Leonid Ravich
Date: Sun, 3 Jan 2021 20:12:54 +0200
Subject: [PATCH 10/64] nvmet-fc: use RCU protection for assoc_list searching

The assoc_list is searched under rcu_read_lock and, since the list is
not changed inline, according to the RCU list rules. The queue array
embedded into nvmet_fc_tgt_assoc is protected by rcu_read_lock
according to the RCU dereference/assign rules. Queue and assoc objects
are freed after a grace period by call_rcu. The tgtport lock is still
taken for changing the assoc_list.
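In short, readers and writers now follow the classic RCU list pattern.
An illustrative sketch (not part of the diff, variable names as in the
driver):

    /* reader: no tgtport->lock needed */
    rcu_read_lock();
    list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) {
            /* examine assoc; take a reference before using it
             * outside the read-side critical section */
    }
    rcu_read_unlock();

    /* writer: still serialized by tgtport->lock */
    list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list);
    list_del_rcu(&assoc->a_list);
    kfree_rcu(assoc, rcu);  /* freed only after a grace period */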
Reviewed-by: Eldad Zinger Reviewed-by: Elad Grupi Reviewed-by: James Smart Signed-off-by: Leonid Ravich Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 81 +++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index cd4e73aa9807..c14c60bfdf85 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -145,6 +145,7 @@ struct nvmet_fc_tgt_queue { struct list_head avail_defer_list; struct workqueue_struct *work_q; struct kref ref; + struct rcu_head rcu; struct nvmet_fc_fcp_iod fod[]; /* array of fcp_iods */ } __aligned(sizeof(unsigned long long)); @@ -167,6 +168,7 @@ struct nvmet_fc_tgt_assoc { struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; struct work_struct del_work; + struct rcu_head rcu; }; @@ -790,7 +792,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, u16 qid, u16 sqsize) { struct nvmet_fc_tgt_queue *queue; - unsigned long flags; int ret; if (qid > NVMET_NR_QUEUES) @@ -829,9 +830,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, goto out_fail_iodlist; WARN_ON(assoc->queues[qid]); - spin_lock_irqsave(&assoc->tgtport->lock, flags); - assoc->queues[qid] = queue; - spin_unlock_irqrestore(&assoc->tgtport->lock, flags); + rcu_assign_pointer(assoc->queues[qid], queue); return queue; @@ -851,11 +850,8 @@ nvmet_fc_tgt_queue_free(struct kref *ref) { struct nvmet_fc_tgt_queue *queue = container_of(ref, struct nvmet_fc_tgt_queue, ref); - unsigned long flags; - spin_lock_irqsave(&queue->assoc->tgtport->lock, flags); - queue->assoc->queues[queue->qid] = NULL; - spin_unlock_irqrestore(&queue->assoc->tgtport->lock, flags); + rcu_assign_pointer(queue->assoc->queues[queue->qid], NULL); nvmet_fc_destroy_fcp_iodlist(queue->assoc->tgtport, queue); @@ -863,7 +859,7 @@ nvmet_fc_tgt_queue_free(struct kref *ref) destroy_workqueue(queue->work_q); - kfree(queue); + kfree_rcu(queue, rcu); } static void @@ -965,24 +961,23 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_tgt_queue *queue; u64 association_id = nvmet_fc_getassociationid(connection_id); u16 qid = nvmet_fc_getqueueid(connection_id); - unsigned long flags; if (qid > NVMET_NR_QUEUES) return NULL; - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { - queue = assoc->queues[qid]; + queue = rcu_dereference(assoc->queues[qid]); if (queue && (!atomic_read(&queue->connected) || !nvmet_fc_tgt_q_get(queue))) queue = NULL; - spin_unlock_irqrestore(&tgtport->lock, flags); + rcu_read_unlock(); return queue; } } - spin_unlock_irqrestore(&tgtport->lock, flags); + rcu_read_unlock(); return NULL; } @@ -1137,7 +1132,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) } if (!needrandom) { assoc->association_id = ran; - list_add_tail(&assoc->a_list, &tgtport->assoc_list); + list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list); } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1167,7 +1162,7 @@ nvmet_fc_target_assoc_free(struct kref *ref) nvmet_fc_free_hostport(assoc->hostport); spin_lock_irqsave(&tgtport->lock, flags); - list_del(&assoc->a_list); + list_del_rcu(&assoc->a_list); oldls = assoc->rcv_disconn; spin_unlock_irqrestore(&tgtport->lock, flags); /* if pending Rcv Disconnect Association LS, send rsp now */ @@ -1177,7 +1172,7 @@ 
nvmet_fc_target_assoc_free(struct kref *ref) dev_info(tgtport->dev, "{%d:%d} Association freed\n", tgtport->fc_target_port.port_num, assoc->a_id); - kfree(assoc); + kfree_rcu(assoc, rcu); nvmet_fc_tgtport_put(tgtport); } @@ -1198,7 +1193,6 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) { struct nvmet_fc_tgtport *tgtport = assoc->tgtport; struct nvmet_fc_tgt_queue *queue; - unsigned long flags; int i, terminating; terminating = atomic_xchg(&assoc->terminating, 1); @@ -1207,19 +1201,23 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) if (terminating) return; - spin_lock_irqsave(&tgtport->lock, flags); + for (i = NVMET_NR_QUEUES; i >= 0; i--) { - queue = assoc->queues[i]; - if (queue) { - if (!nvmet_fc_tgt_q_get(queue)) - continue; - spin_unlock_irqrestore(&tgtport->lock, flags); - nvmet_fc_delete_target_queue(queue); - nvmet_fc_tgt_q_put(queue); - spin_lock_irqsave(&tgtport->lock, flags); + rcu_read_lock(); + queue = rcu_dereference(assoc->queues[i]); + if (!queue) { + rcu_read_unlock(); + continue; } + + if (!nvmet_fc_tgt_q_get(queue)) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + nvmet_fc_delete_target_queue(queue); + nvmet_fc_tgt_q_put(queue); } - spin_unlock_irqrestore(&tgtport->lock, flags); dev_info(tgtport->dev, "{%d:%d} Association deleted\n", @@ -1234,10 +1232,9 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, { struct nvmet_fc_tgt_assoc *assoc; struct nvmet_fc_tgt_assoc *ret = NULL; - unsigned long flags; - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { ret = assoc; if (!nvmet_fc_tgt_a_get(assoc)) @@ -1245,7 +1242,7 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, break; } } - spin_unlock_irqrestore(&tgtport->lock, flags); + rcu_read_unlock(); return ret; } @@ -1473,19 +1470,17 @@ nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport) static void __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) { - struct nvmet_fc_tgt_assoc *assoc, *next; - unsigned long flags; + struct nvmet_fc_tgt_assoc *assoc; - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry_safe(assoc, next, - &tgtport->assoc_list, a_list) { + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); } - spin_unlock_irqrestore(&tgtport->lock, flags); + rcu_read_unlock(); } /** @@ -1568,16 +1563,16 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) continue; spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { - queue = assoc->queues[0]; + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { + queue = rcu_dereference(assoc->queues[0]); if (queue && queue->nvme_sq.ctrl == ctrl) { if (nvmet_fc_tgt_a_get(assoc)) found_ctrl = true; break; } } - spin_unlock_irqrestore(&tgtport->lock, flags); + rcu_read_unlock(); nvmet_fc_tgtport_put(tgtport); From 60b152a50820a125336ecae26da489059fc61ce1 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sat, 9 Jan 2021 00:41:47 +0100 Subject: [PATCH 11/64] nvme: constify static attribute_group structs The only usage of these is to put their addresses in arrays of pointers to const attribute_groups. 
Make them const to allow the compiler to put them in read-only memory.

Signed-off-by: Rikard Falkeborn
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 4 ++--
 drivers/nvme/host/fc.c | 2 +-
 drivers/nvme/target/fcloop.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ba5df80881ea..ff0f42652abb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2859,7 +2859,7 @@ static struct attribute *nvme_subsys_attrs[] = {
 NULL,
 };
-static struct attribute_group nvme_subsys_attrs_group = {
+static const struct attribute_group nvme_subsys_attrs_group = {
 .attrs = nvme_subsys_attrs,
 };
@@ -3694,7 +3694,7 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 return a->mode;
 }
-static struct attribute_group nvme_dev_attrs_group = {
+static const struct attribute_group nvme_dev_attrs_group = {
 .attrs = nvme_dev_attrs,
 .is_visible = nvme_dev_attrs_are_visible,
 };
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5f36cfa8136c..20dadd86e981 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3789,7 +3789,7 @@ static struct attribute *nvme_fc_attrs[] = {
 NULL
 };
-static struct attribute_group nvme_fc_attr_group = {
+static const struct attribute_group nvme_fc_attr_group = {
 .attrs = nvme_fc_attrs,
 };
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 68213f0a052b..54606f1872b4 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1545,7 +1545,7 @@ static struct attribute *fcloop_dev_attrs[] = {
 NULL
 };
-static struct attribute_group fclopp_dev_attrs_group = {
+static const struct attribute_group fclopp_dev_attrs_group = {
 .attrs = fcloop_dev_attrs,
 };

From f9063a53274d25a878310db3fb645bfa9e49c917 Mon Sep 17 00:00:00 2001
From: Minwoo Im
Date: Fri, 8 Jan 2021 23:46:57 +0900
Subject: [PATCH 12/64] nvme: support command retry delay for admin command

The controller can request a delay before retrying a failed command by
setting the Command Retry Delay (CRD) field in the Completion Queue
Entry. Currently this feature is only applied to commands on the I/O
queue, but not to commands on the admin queue. Retrieve the nvme_ctrl
from the request so that no namespace is required and apply the feature
to all commands.

Signed-off-by: Minwoo Im
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ff0f42652abb..636a88c93194 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -279,14 +279,13 @@ static blk_status_t nvme_error_status(u16 status)
 static void nvme_retry_req(struct request *req)
 {
- struct nvme_ns *ns = req->q->queuedata;
 unsigned long delay = 0;
 u16 crd;
 /* The mask and shift result must be <= 3 */
 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
- if (ns && crd)
- delay = ns->ctrl->crdt[crd - 1] * 100;
+ if (crd)
+ delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
 nvme_req(req)->retries++;
 blk_mq_requeue_request(req, false);

From cb9b870fba3eba57cf3bcd7c6c4d4aa88bc5fe70 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 14 Jan 2021 13:15:24 -0800
Subject: [PATCH 13/64] nvme-tcp: fix wrong setting of request iov_iter

We might set the iov_iter direction wrong, which is harmless for this
use-case, but get it right. Also this makes the code slightly cleaner.
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/tcp.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 881d28eb15e9..4367923d03e4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -983,7 +983,6 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 req->state = NVME_TCP_SEND_DATA;
 if (queue->data_digest)
 crypto_ahash_init(queue->snd_hash);
- nvme_tcp_init_iter(req, WRITE);
 } else {
 nvme_tcp_done_send_req(queue);
 }
@@ -1016,8 +1015,6 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 req->state = NVME_TCP_SEND_DATA;
 if (queue->data_digest)
 crypto_ahash_init(queue->snd_hash);
- if (!req->data_sent)
- nvme_tcp_init_iter(req, WRITE);
 return 1;
 }
 req->offset += ret;
@@ -2268,12 +2265,12 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 req->data_len = blk_rq_nr_phys_segments(rq) ?
 blk_rq_payload_bytes(rq) : 0;
 req->curr_bio = rq->bio;
+ if (req->curr_bio)
+ nvme_tcp_init_iter(req, rq_data_dir(rq));
 if (rq_data_dir(rq) == WRITE &&
 req->data_len <= nvme_tcp_inline_data_size(queue))
 req->pdu_len = req->data_len;
- else if (req->curr_bio)
- nvme_tcp_init_iter(req, READ);
 pdu->hdr.type = nvme_tcp_cmd;
 pdu->hdr.flags = 0;

From 60141aa08c08a43f3d22626b3a2532106a90a191 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 14 Jan 2021 13:15:25 -0800
Subject: [PATCH 14/64] nvme-tcp: get rid of unused helper function

Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/tcp.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4367923d03e4..f2f3471faed3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -206,11 +206,6 @@ static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 req->pdu_len - req->pdu_sent);
 }
-static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
-{
- return req->iter.iov_offset;
-}
-
 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 {
 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?

From 0dc9edaf80ea3c48231d94cd482355699d453888 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 14 Jan 2021 13:15:26 -0800
Subject: [PATCH 15/64] nvme-tcp: pass multipage bvec to request iov_iter

iov_iter uses the right helpers so we should be able to pass in a
multipage bvec. Right now the iov_iter is initialized with more
segments than it needs, which doesn't fail because the iov_iter is
capped by byte count, but it is better to use a full multipage bvec
iter.
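As an illustration (a hypothetical bio spanning four physically
contiguous pages that were merged into a single bvec):

    bio_for_each_bvec(bv, bio, bi)
            nr_bvec++;              /* multipage bvecs: 1 */
    nr_segs = bio_segments(bio);    /* single-page segments: 4 */

Both describe the same bytes, so the old count worked, but the
multipage count matches how iov_iter_bvec() actually walks the data.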
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f2f3471faed3..4c13c7110dbe 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -224,24 +224,29 @@ static void nvme_tcp_init_iter(struct nvme_tcp_request *req, struct request *rq = blk_mq_rq_from_pdu(req); struct bio_vec *vec; unsigned int size; - int nsegs; + int nr_bvec; size_t offset; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { vec = &rq->special_vec; - nsegs = 1; + nr_bvec = 1; size = blk_rq_payload_bytes(rq); offset = 0; } else { struct bio *bio = req->curr_bio; + struct bvec_iter bi; + struct bio_vec bv; vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - nsegs = bio_segments(bio); + nr_bvec = 0; + bio_for_each_bvec(bv, bio, bi) { + nr_bvec++; + } size = bio->bi_iter.bi_size; offset = bio->bi_iter.bi_bvec_done; } - iov_iter_bvec(&req->iter, dir, vec, nsegs, size); + iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size); req->iter.iov_offset = offset; } From fc97e942d90c2103755f2fcd9a068a4ee7dfc1bf Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 13 Jan 2021 23:36:27 +0900 Subject: [PATCH 16/64] nvme: refactor ns->ctrl by request Just for current code in nvme_cleanup_cmd(), we don't have to get namespace instance, but we need controller instance. Controller instance can be retrieved by namespace instance, but it can be directly accessed by nvme_request instance from request. ctrl = nvme_req(req)->ctrl; We don't have to go around namespace instance from request instance through gendisk. Signed-off-by: Minwoo Im Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 636a88c93194..009830d247f8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -841,11 +841,11 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - struct nvme_ns *ns = req->rq_disk->private_data; + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; struct page *page = req->special_vec.bv_page; - if (page == ns->ctrl->discard_page) - clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + if (page == ctrl->discard_page) + clear_bit_unlock(0, &ctrl->discard_page_busy); else kfree(page_address(page) + req->special_vec.bv_offset); } From 624e67fdf9a657fe437d84dd9f28b35e594183dd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 13 Jan 2021 17:33:52 -0800 Subject: [PATCH 17/64] nvmet: remove extra variable in smart log nsid We remove the extra local variable struct nvmet_ns in nvmet_get_smart_log_nsid() since req already has ns member that can be reused, this also eliminates the explicit call to nvmet_put_namespace() which is already present in the request completion path. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dc1ea468b182..de804d9762dd 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -74,11 +74,11 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { - struct nvmet_ns *ns; u64 host_reads, host_writes, data_units_read, data_units_written; - ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); - if (!ns) { + req->ns = nvmet_find_namespace(req->sq->ctrl, + req->cmd->get_log_page.nsid); + if (!req->ns) { pr_err("Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); req->error_loc = offsetof(struct nvme_rw_command, nsid); @@ -86,22 +86,20 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, } /* we don't have the right data for file backed ns */ - if (!ns->bdev) - goto out; + if (!req->ns->bdev) + return NVME_SC_SUCCESS; - host_reads = part_stat_read(ns->bdev, ios[READ]); + host_reads = part_stat_read(req->ns->bdev, ios[READ]); data_units_read = - DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[READ]), 1000); - host_writes = part_stat_read(ns->bdev, ios[WRITE]); + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(req->ns->bdev, ios[WRITE]); data_units_written = - DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[WRITE]), 1000); + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); -out: - nvmet_put_namespace(ns); return NVME_SC_SUCCESS; } From 3631c7f4a24165b9431942b85b502454edb0c33b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 13 Jan 2021 17:33:53 -0800 Subject: [PATCH 18/64] nvmet: remove extra variable in id-desclist We remove the extra local variable struct nvmet_ns in nvmet_execute_identify_desclist() since req already has ns member that can be reused, this also eliminates the explicit call to nvmet_put_namespace() which is already present in the request completion path. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index de804d9762dd..1cc61ca42a7d 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -605,37 +605,35 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, static void nvmet_execute_identify_desclist(struct nvmet_req *req) { - struct nvmet_ns *ns; u16 status = 0; off_t off = 0; - ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); - if (!ns) { + req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!req->ns) { req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } - if (memchr_inv(&ns->uuid, 0, sizeof(ns->uuid))) { + if (memchr_inv(&req->ns->uuid, 0, sizeof(req->ns->uuid))) { status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, NVME_NIDT_UUID_LEN, - &ns->uuid, &off); + &req->ns->uuid, &off); if (status) - goto out_put_ns; + goto out; } - if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) { + if (memchr_inv(req->ns->nguid, 0, sizeof(req->ns->nguid))) { status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, NVME_NIDT_NGUID_LEN, - &ns->nguid, &off); + &req->ns->nguid, &off); if (status) - goto out_put_ns; + goto out; } if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, off) != NVME_IDENTIFY_DATA_SIZE - off) status = NVME_SC_INTERNAL | NVME_SC_DNR; -out_put_ns: - nvmet_put_namespace(ns); + out: nvmet_req_complete(req, status); } From 3c7b224f1956ed232b24ed2eb2c54e4476c6acb2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 13 Jan 2021 17:33:54 -0800 Subject: [PATCH 19/64] nvmet: remove extra variable in identify ns We remove the extra local variable struct nvmet_ns in nvmet_execute_identify_ns() since req already has ns member that can be reused, this also eliminates the explicit call to nvmet_put_namespace() which is already present in the request completion path. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1cc61ca42a7d..613a4d8feac1 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -467,7 +467,6 @@ out: static void nvmet_execute_identify_ns(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; - struct nvmet_ns *ns; struct nvme_id_ns *id; u16 status = 0; @@ -484,20 +483,21 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } /* return an all zeroed buffer if we can't find an active namespace */ - ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid); - if (!ns) { + req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid); + if (!req->ns) { status = NVME_SC_INVALID_NS; goto done; } - nvmet_ns_revalidate(ns); + nvmet_ns_revalidate(req->ns); /* * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. 
*/ - id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift); - switch (req->port->ana_state[ns->anagrpid]) { + id->ncap = id->nsze = + cpu_to_le64(req->ns->size >> req->ns->blksize_shift); + switch (req->port->ana_state[req->ns->anagrpid]) { case NVME_ANA_INACCESSIBLE: case NVME_ANA_PERSISTENT_LOSS: break; @@ -506,8 +506,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) break; } - if (ns->bdev) - nvmet_bdev_set_limits(ns->bdev, id); + if (req->ns->bdev) + nvmet_bdev_set_limits(req->ns->bdev, id); /* * We just provide a single LBA format that matches what the @@ -521,25 +521,24 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * controllers, but also with any other user of the block device. */ id->nmic = (1 << 0); - id->anagrpid = cpu_to_le32(ns->anagrpid); + id->anagrpid = cpu_to_le32(req->ns->anagrpid); - memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid)); + memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid)); - id->lbaf[0].ds = ns->blksize_shift; + id->lbaf[0].ds = req->ns->blksize_shift; - if (ctrl->pi_support && nvmet_ns_has_pi(ns)) { + if (ctrl->pi_support && nvmet_ns_has_pi(req->ns)) { id->dpc = NVME_NS_DPC_PI_FIRST | NVME_NS_DPC_PI_LAST | NVME_NS_DPC_PI_TYPE1 | NVME_NS_DPC_PI_TYPE2 | NVME_NS_DPC_PI_TYPE3; id->mc = NVME_MC_EXTENDED_LBA; - id->dps = ns->pi_type; + id->dps = req->ns->pi_type; id->flbas = NVME_NS_FLBAS_META_EXT; - id->lbaf[0].ms = cpu_to_le16(ns->metadata_size); + id->lbaf[0].ms = cpu_to_le16(req->ns->metadata_size); } - if (ns->readonly) + if (req->ns->readonly) id->nsattr |= (1 << 0); - nvmet_put_namespace(ns); done: if (!status) status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); From 193fcf371f9e3705c14a0bf1d4bfc44af0f7c124 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 11 Jan 2021 20:26:16 -0800 Subject: [PATCH 20/64] nvmet: add lba to sect conversion helpers In this preparation patch, we add helpers to convert lbas to sectors & sectors to lba. This is needed to eliminate code duplication in the ZBD backend. Use these helpers in the block device backend. 
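A quick sanity check of the helpers with hypothetical values, assuming
a 4096-byte LBA format (blksize_shift = 12, SECTOR_SHIFT = 9, so the
shift is 3):

    nvmet_lba_to_sect(ns, cpu_to_le64(2));  /* LBA 2     -> sector 16 */
    nvmet_sect_to_lba(ns, 24);              /* sector 24 -> LBA 3     */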
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 8 +++----- drivers/nvme/target/nvmet.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 125dde3f410e..23095bdfce06 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -256,8 +256,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) if (is_pci_p2pdma_page(sg_page(req->sg))) op |= REQ_NOMERGE; - sector = le64_to_cpu(req->cmd->rw.slba); - sector <<= (req->ns->blksize_shift - 9); + sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { bio = &req->b.inline_bio; @@ -345,7 +344,7 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req, int ret; ret = __blkdev_issue_discard(ns->bdev, - le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + nvmet_lba_to_sect(ns, range->slba), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); if (ret && ret != -EOPNOTSUPP) { @@ -414,8 +413,7 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) if (!nvmet_check_transfer_len(req, 0)) return; - sector = le64_to_cpu(write_zeroes->slba) << - (req->ns->blksize_shift - 9); + sector = nvmet_lba_to_sect(req->ns, write_zeroes->slba); nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << (req->ns->blksize_shift - 9)); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 592763732065..8776dd1a0490 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -603,4 +603,14 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns) return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple); } +static inline __le64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect) +{ + return cpu_to_le64(sect >> (ns->blksize_shift - SECTOR_SHIFT)); +} + +static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba) +{ + return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT); +} + #endif /* _NVMET_H */ From 3254899e0b52f10b9a3e7db4d10f081f60705ba9 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 21 Jan 2021 09:09:47 +0000 Subject: [PATCH 21/64] nvme: update enumerations for status codes All the updates are mentioned in the ratified NVMe 1.4 spec. 
Reviewed-by: Hannes Reinecke Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bfed36e342cc..458719544253 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1473,20 +1473,29 @@ enum { NVME_SC_SGL_INVALID_DATA = 0xf, NVME_SC_SGL_INVALID_METADATA = 0x10, NVME_SC_SGL_INVALID_TYPE = 0x11, - + NVME_SC_CMB_INVALID_USE = 0x12, + NVME_SC_PRP_INVALID_OFFSET = 0x13, + NVME_SC_ATOMIC_WU_EXCEEDED = 0x14, + NVME_SC_OP_DENIED = 0x15, NVME_SC_SGL_INVALID_OFFSET = 0x16, - NVME_SC_SGL_INVALID_SUBTYPE = 0x17, - + NVME_SC_RESERVED = 0x17, + NVME_SC_HOST_ID_INCONSIST = 0x18, + NVME_SC_KA_TIMEOUT_EXPIRED = 0x19, + NVME_SC_KA_TIMEOUT_INVALID = 0x1A, + NVME_SC_ABORTED_PREEMPT_ABORT = 0x1B, NVME_SC_SANITIZE_FAILED = 0x1C, NVME_SC_SANITIZE_IN_PROGRESS = 0x1D, - + NVME_SC_SGL_INVALID_GRANULARITY = 0x1E, + NVME_SC_CMD_NOT_SUP_CMB_QUEUE = 0x1F, NVME_SC_NS_WRITE_PROTECTED = 0x20, NVME_SC_CMD_INTERRUPTED = 0x21, + NVME_SC_TRANSIENT_TR_ERR = 0x22, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, + NVME_SC_FORMAT_IN_PROGRESS = 0x84, /* * Command Specific Status: @@ -1519,8 +1528,15 @@ enum { NVME_SC_NS_NOT_ATTACHED = 0x11a, NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, NVME_SC_CTRL_LIST_INVALID = 0x11c, + NVME_SC_SELT_TEST_IN_PROGRESS = 0x11d, NVME_SC_BP_WRITE_PROHIBITED = 0x11e, + NVME_SC_CTRL_ID_INVALID = 0x11f, + NVME_SC_SEC_CTRL_STATE_INVALID = 0x120, + NVME_SC_CTRL_RES_NUM_INVALID = 0x121, + NVME_SC_RES_ID_INVALID = 0x122, NVME_SC_PMR_SAN_PROHIBITED = 0x123, + NVME_SC_ANA_GROUP_ID_INVALID = 0x124, + NVME_SC_ANA_ATTACH_FAILED = 0x125, /* * I/O Command Set Specific - NVM commands: From 3a98c51a24825173455c479822aa2f89fecbe6af Mon Sep 17 00:00:00 2001 From: Michal Krakowiak Date: Mon, 4 Jan 2021 16:53:43 +0100 Subject: [PATCH 22/64] nvme: parse format nvm command details when tracing Add detailed parsing of format nvm admin command to make the trace log more consistent and human-readable. 
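As a hypothetical example of the new output, a Format NVM command with
cdw10[0] = 0x21 and cdw10[1] = 0x02 would be traced as:

    lbaf=1, mset=0, pi=1, pil=0, ses=1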
Signed-off-by: Michal Krakowiak Acked-by: Dan Williams Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5c3cb6928f3c..e0400de713b5 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -102,6 +102,23 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p, return ret; } +static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 lbaf = cdw10[0] & 0xF; + u8 mset = (cdw10[0] >> 4) & 0x1; + u8 pi = (cdw10[0] >> 5) & 0x7; + u8 pil = cdw10[1] & 0x1; + u8 ses = (cdw10[1] >> 1) & 0x7; + + trace_seq_printf(p, "lbaf=%u, mset=%u, pi=%u, pil=%u, ses=%u", + lbaf, mset, pi, pil, ses); + + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -159,6 +176,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: return nvme_trace_get_lba_status(p, cdw10); + case nvme_admin_format_nvm: + return nvme_trace_admin_format_nvm(p, cdw10); default: return nvme_trace_common(p, cdw10); } From 4a407d5ebc7ac1ea8c6e2692bd79320459dc60f6 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 27 Jan 2021 02:50:00 +0900 Subject: [PATCH 23/64] nvme: add tracing of zns commands When support for the NVMe ZNS commands was merged, tracing of these has been omitted. Add nvme_cmd_zone_mgmt_send, nvme_cmd_zone_mgmt_recv as well as nvme_cmd_zone_append to the nvme driver's tracing facility. Signed-off-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 6 +++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index e0400de713b5..6543015b6121 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -148,6 +148,35 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u8 zsa = cdw10[12]; + u8 all = cdw10[13]; + + trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 numd = get_unaligned_le32(cdw10 + 8); + u8 zra = cdw10[12]; + u8 zrasf = cdw10[13]; + u8 pr = cdw10[14]; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u", + slba, numd, zra, zrasf, pr); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -190,9 +219,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, case nvme_cmd_read: case nvme_cmd_write: case nvme_cmd_write_zeroes: + case nvme_cmd_zone_append: return nvme_trace_read_write(p, cdw10); case nvme_cmd_dsm: return nvme_trace_dsm(p, cdw10); + case nvme_cmd_zone_mgmt_send: + return nvme_trace_zone_mgmt_send(p, cdw10); + case nvme_cmd_zone_mgmt_recv: + return nvme_trace_zone_mgmt_recv(p, cdw10); default: return 
nvme_trace_common(p, cdw10); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 458719544253..b08787cd0881 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -697,7 +697,11 @@ enum nvme_opcode { nvme_opcode_name(nvme_cmd_resv_register), \ nvme_opcode_name(nvme_cmd_resv_report), \ nvme_opcode_name(nvme_cmd_resv_acquire), \ - nvme_opcode_name(nvme_cmd_resv_release)) + nvme_opcode_name(nvme_cmd_resv_release), \ + nvme_opcode_name(nvme_cmd_zone_mgmt_send), \ + nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \ + nvme_opcode_name(nvme_cmd_zone_append)) + /* From 8f8ea928fd77db60dc22276e3acdb9ca41cbf8dd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 26 Jan 2021 11:47:52 -0800 Subject: [PATCH 24/64] nvme-core: get rid of the extra space Remove the extra space in the nvme_free_cels() when calling xa_for_each loop which is not a common practice (except drivers/infiniband/core/ not sure why). Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 009830d247f8..168601d96f48 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4448,7 +4448,7 @@ static void nvme_free_cels(struct nvme_ctrl *ctrl) struct nvme_effects_log *cel; unsigned long i; - xa_for_each (&ctrl->cels, i, cel) { + xa_for_each(&ctrl->cels, i, cel) { xa_erase(&ctrl->cels, i); kfree(cel); } From 2547906982e2e6a0d42f8957f55af5bb51a7e55f Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 21 Jan 2021 11:32:36 +0800 Subject: [PATCH 25/64] nvme-core: add cancel tagset helpers Add nvme_cancel_tagset and nvme_cancel_admin_tagset for tear down and reconnection error handling. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 20 ++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 168601d96f48..4e8e310033c9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -370,6 +370,26 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) } EXPORT_SYMBOL_GPL(nvme_cancel_request); +void nvme_cancel_tagset(struct nvme_ctrl *ctrl) +{ + if (ctrl->tagset) { + blk_mq_tagset_busy_iter(ctrl->tagset, + nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->tagset); + } +} +EXPORT_SYMBOL_GPL(nvme_cancel_tagset); + +void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) +{ + if (ctrl->admin_tagset) { + blk_mq_tagset_busy_iter(ctrl->admin_tagset, + nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); + } +} +EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset); + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 88a6b97247f5..a72f07181091 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -576,6 +576,8 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); +void nvme_cancel_tagset(struct nvme_ctrl *ctrl); +void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); bool nvme_wait_reset(struct nvme_ctrl *ctrl); From 958dc1d32c80566f58d18f05ef1f05bd32d172c1 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: 
Thu, 21 Jan 2021 11:32:37 +0800 Subject: [PATCH 26/64] nvme-rdma: add clean action for failed reconnection A crash happens when inject failed reconnection. If reconnect failed after start io queues, the queues will be unquiesced and new requests continue to be delivered. Reconnection error handling process directly free queues without cancel suspend requests. The suppend request will time out, and then crash due to use the queue after free. Add sync queues and cancel suppend requests for reconnection error handling. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f5ef3edeb2fd..d92132cbcbbe 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -919,12 +919,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, error = nvme_init_identify(&ctrl->ctrl); if (error) - goto out_stop_queue; + goto out_quiesce_queue; return 0; +out_quiesce_queue: + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->ctrl.admin_q); @@ -1001,8 +1005,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) out_wait_freeze_timed_out: nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_connect_q: + nvme_cancel_tagset(&ctrl->ctrl); if (new) blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: @@ -1144,10 +1150,18 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) return 0; destroy_io: - if (ctrl->ctrl.queue_count > 1) + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + nvme_cancel_tagset(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, new); + } destroy_admin: + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, new); return ret; } From 70a99574a79f1cd4dc7ad56ea37be40844bfb97b Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 21 Jan 2021 11:32:38 +0800 Subject: [PATCH 27/64] nvme-tcp: add clean action for failed reconnection If reconnect failed after start io queues, the queues will be unquiesced and new requests continue to be delivered. Reconnection error handling process directly free queues without cancel suspend requests. The suppend request will time out, and then crash due to use the queue after free. Add sync queues and cancel suppend requests for reconnection error handling. 
Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4c13c7110dbe..8c256adb8c41 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1812,8 +1812,10 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) out_wait_freeze_timed_out: nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: + nvme_cancel_tagset(ctrl); if (new) blk_cleanup_queue(ctrl->connect_q); out_free_tag_set: @@ -1875,12 +1877,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) error = nvme_init_identify(ctrl); if (error) - goto out_stop_queue; + goto out_quiesce_queue; return 0; +out_quiesce_queue: + blk_mq_quiesce_queue(ctrl->admin_q); + blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->admin_q); @@ -2000,10 +2006,18 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) return 0; destroy_io: - if (ctrl->queue_count > 1) + if (ctrl->queue_count > 1) { + nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + nvme_cancel_tagset(ctrl); nvme_tcp_destroy_io_queues(ctrl, new); + } destroy_admin: + blk_mq_quiesce_queue(ctrl->admin_q); + blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); nvme_tcp_destroy_admin_queue(ctrl, new); return ret; } From c4189d680e12f0a41eea94a1f466142b2bf02c3d Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 21 Jan 2021 11:32:39 +0800 Subject: [PATCH 28/64] nvme-rdma: use cancel tagset helper for tear down Use nvme_cancel_tagset and nvme_cancel_admin_tagset to clean up the teardown code. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d92132cbcbbe..6700d8bab68a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1025,11 +1025,7 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); - if (ctrl->ctrl.admin_tagset) { - blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); - } + nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); @@ -1043,11 +1039,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_stop_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); - if (ctrl->ctrl.tagset) { - blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); - } + nvme_cancel_tagset(&ctrl->ctrl); if (remove) nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); From 563c81586d0ab2841487a61fb34d6e9cd5efded7 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 21 Jan 2021 11:32:40 +0800 Subject: [PATCH 29/64] nvme-tcp: use cancel tagset helper for tear down Use nvme_cancel_tagset and nvme_cancel_admin_tagset to clean up the teardown code.
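For reference, a sketch of the transformation these two cleanup patches apply at each teardown site; the "before" block is the open-coded pattern being removed, and the "after" line is the helper added by "nvme-core: add cancel tagset helpers" earlier in this series:

	/* Before: open-coded in every transport teardown path. */
	if (ctrl->tagset) {
		blk_mq_tagset_busy_iter(ctrl->tagset,
				nvme_cancel_request, ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->tagset);
	}

	/* After: one call to the shared helper. */
	nvme_cancel_tagset(ctrl);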
Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8c256adb8c41..619b0d8f6e38 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1907,11 +1907,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_mq_quiesce_queue(ctrl->admin_q); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); - if (ctrl->admin_tagset) { - blk_mq_tagset_busy_iter(ctrl->admin_tagset, - nvme_cancel_request, ctrl); - blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); - } + nvme_cancel_admin_tagset(ctrl); if (remove) blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); @@ -1927,11 +1923,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, nvme_stop_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); - if (ctrl->tagset) { - blk_mq_tagset_busy_iter(ctrl->tagset, - nvme_cancel_request, ctrl); - blk_mq_tagset_wait_completed_request(ctrl->tagset); - } + nvme_cancel_tagset(ctrl); if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); From c5eec74f252dfba25269cd68f9a3407aedefd330 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 16 Dec 2020 02:26:22 +0100 Subject: [PATCH 30/64] md/raid5: cast chunk_sectors to sector_t value Currently, raid5 computes dev_sectors by masking with ~(chunk_sectors - 1) without first casting chunk_sectors to sector_t. chunk_sectors is a 32-bit field, so the mask is computed in 32-bit arithmetic before being applied to the 64-bit dev_sectors, which can truncate dev_sectors on large devices. Cast chunk_sectors to sector_t so the mask is computed in 64 bits. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f411b9e5c332..b71f50132495 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7661,7 +7661,7 @@ static int raid5_run(struct mddev *mddev) } /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); + mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1); mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > dirty_parity_disks && From 8a0c014cd20516ade9654fc13b51345ec58e7be8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 22 Jan 2021 12:13:20 +0100 Subject: [PATCH 31/64] floppy: reintroduce O_NDELAY fix This issue was originally fixed in 09954bad4 ("floppy: refactor open() flags handling"). That fix, however, had the side effect of breaking open(O_ACCMODE), which is used for ioctl-only opens. I wrote a fix for that, but instead of merging it, 09954bad4 was fully reverted, re-introducing the O_NDELAY / O_NONBLOCK issue, which now strikes again. This is a forward-port of the original fix to the current codebase; the original submission had the changelog below: ==== Commit 09954bad4 ("floppy: refactor open() flags handling"), as a side-effect, causes open(/dev/fdX, O_ACCMODE) to fail. It turns out that this is used by the setfdprm userspace utility for ioctl-only open(). Reintroduce the original behavior wrt !(FMODE_READ|FMODE_WRITE) modes, while still keeping the original O_NDELAY bug fixed.
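For context, a hypothetical userspace snippet showing the ioctl-only open pattern described above; FDSETPRM and struct floppy_struct come from the kernel's <linux/fd.h> interface, while the device path and the zeroed parameter block are purely illustrative:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fd.h>

	int main(void)
	{
		struct floppy_struct prm = { 0 };	/* illustrative values */
		/*
		 * Access mode 3 (O_ACCMODE) requests neither read nor
		 * write: no data transfer is possible on the fd, it is
		 * only good for ioctls. This is the open() that the
		 * refactor broke.
		 */
		int fd = open("/dev/fd0", O_ACCMODE);

		if (fd < 0)
			return 1;
		ioctl(fd, FDSETPRM, &prm);
		close(fd);
		return 0;
	}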
Link: https://lore.kernel.org/r/nycvar.YFH.7.76.2101221209060.5622@cbobk.fhfr.pm Cc: stable@vger.kernel.org Reported-by: Wim Osterholt Tested-by: Wim Osterholt Reported-and-tested-by: Kurt Garloff Fixes: 09954bad4 ("floppy: refactor open() flags handling") Fixes: f2791e7ead ("Revert "floppy: refactor open() flags handling"") Signed-off-by: Jiri Kosina Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index dfe1dfc901cc..0b71292d9d5a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4121,23 +4121,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (fdc_state[FDC(drive)].rawcmd == 1) fdc_state[FDC(drive)].rawcmd = 2; - if (!(mode & FMODE_NDELAY)) { - if (mode & (FMODE_READ|FMODE_WRITE)) { - drive_state[drive].last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, - &drive_state[drive].flags); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); - if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) - goto out; - } - res = -EROFS; - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) + if (mode & (FMODE_READ|FMODE_WRITE)) { + drive_state[drive].last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); + if (bdev_check_media_change(bdev)) + floppy_revalidate(bdev->bd_disk); + if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) goto out; } + + res = -EROFS; + + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) + goto out; + mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; From ee8f353b1591cef4a29cddeb379c1503559f474e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Feb 2021 17:43:42 +0900 Subject: [PATCH 32/64] block: remove skd driver The STEC S1220 PCIe SSD cards have been EOL since 2014 and are no longer supported by the vendor. As the skd driver for these SSDs is starting to cause problems for improvements to the block layer, stop supporting it in newer kernel versions. Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- MAINTAINERS | 6 - drivers/block/Kconfig | 10 - drivers/block/Makefile | 2 - drivers/block/skd_main.c | 3670 ------------------------------------- drivers/block/skd_s1120.h | 322 ---- 5 files changed, 4010 deletions(-) delete mode 100644 drivers/block/skd_main.c delete mode 100644 drivers/block/skd_s1120.h diff --git a/MAINTAINERS b/MAINTAINERS index 992fe3b0900a..f4766335189a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16938,12 +16938,6 @@ F: include/linux/static_call*.h F: kernel/jump_label.c F: kernel/static_call.c -STEC S1220 SKD DRIVER -M: Damien Le Moal -L: linux-block@vger.kernel.org -S: Maintained -F: drivers/block/skd*[ch] - STI AUDIO (ASoC) DRIVERS M: Arnaud Pouliquen L: alsa-devel@alsa-project.org (moderated for non-subscribers) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 583b671b1d2d..2779e85795a7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -267,16 +267,6 @@ config BLK_DEV_NBD If unsure, say N. -config BLK_DEV_SKD - tristate "STEC S1120 Block Driver" - depends on PCI - depends on 64BIT - help - Saying Y or M here will enable support for the - STEC, Inc. S1120 PCIe SSD.
- - Use device /dev/skd$N amd /dev/skd$Np$M. - config BLK_DEV_SX8 tristate "Promise SATA SX8 support" depends on PCI diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a3170859e01d..b501b8728fb9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o -obj-$(CONFIG_BLK_DEV_SKD) += skd.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o @@ -43,5 +42,4 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ -skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c deleted file mode 100644 index a962b4551bed..000000000000 --- a/drivers/block/skd_main.c +++ /dev/null @@ -1,3670 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Driver for sTec s1120 PCIe SSDs. sTec was acquired in 2013 by HGST and HGST - * was acquired by Western Digital in 2012. - * - * Copyright 2012 sTec, Inc. - * Copyright (c) 2017 Western Digital Corporation or its affiliates. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "skd_s1120.h" - -static int skd_dbg_level; -static int skd_isr_comp_limit = 4; - -#define SKD_ASSERT(expr) \ - do { \ - if (unlikely(!(expr))) { \ - pr_err("Assertion failed! %s,%s,%s,line=%d\n", \ - # expr, __FILE__, __func__, __LINE__); \ - } \ - } while (0) - -#define DRV_NAME "skd" -#define PFX DRV_NAME ": " - -MODULE_LICENSE("GPL"); - -MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver"); - -#define PCI_VENDOR_ID_STEC 0x1B39 -#define PCI_DEVICE_ID_S1120 0x0001 - -#define SKD_FUA_NV (1 << 1) -#define SKD_MINORS_PER_DEVICE 16 - -#define SKD_MAX_QUEUE_DEPTH 200u - -#define SKD_PAUSE_TIMEOUT (5 * 1000) - -#define SKD_N_FITMSG_BYTES (512u) -#define SKD_MAX_REQ_PER_MSG 14 - -#define SKD_N_SPECIAL_FITMSG_BYTES (128u) - -/* SG elements are 32 bytes, so we can make this 4096 and still be under the - * 128KB limit. 
That allows 4096*4K = 16M xfer size - */ -#define SKD_N_SG_PER_REQ_DEFAULT 256u - -#define SKD_N_COMPLETION_ENTRY 256u -#define SKD_N_READ_CAP_BYTES (8u) - -#define SKD_N_INTERNAL_BYTES (512u) - -#define SKD_SKCOMP_SIZE \ - ((sizeof(struct fit_completion_entry_v1) + \ - sizeof(struct fit_comp_error_info)) * SKD_N_COMPLETION_ENTRY) - -/* 5 bits of uniqifier, 0xF800 */ -#define SKD_ID_TABLE_MASK (3u << 8u) -#define SKD_ID_RW_REQUEST (0u << 8u) -#define SKD_ID_INTERNAL (1u << 8u) -#define SKD_ID_FIT_MSG (3u << 8u) -#define SKD_ID_SLOT_MASK 0x00FFu -#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu - -#define SKD_N_MAX_SECTORS 2048u - -#define SKD_MAX_RETRIES 2u - -#define SKD_TIMER_SECONDS(seconds) (seconds) -#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) - -#define INQ_STD_NBYTES 36 - -enum skd_drvr_state { - SKD_DRVR_STATE_LOAD, - SKD_DRVR_STATE_IDLE, - SKD_DRVR_STATE_BUSY, - SKD_DRVR_STATE_STARTING, - SKD_DRVR_STATE_ONLINE, - SKD_DRVR_STATE_PAUSING, - SKD_DRVR_STATE_PAUSED, - SKD_DRVR_STATE_RESTARTING, - SKD_DRVR_STATE_RESUMING, - SKD_DRVR_STATE_STOPPING, - SKD_DRVR_STATE_FAULT, - SKD_DRVR_STATE_DISAPPEARED, - SKD_DRVR_STATE_PROTOCOL_MISMATCH, - SKD_DRVR_STATE_BUSY_ERASE, - SKD_DRVR_STATE_BUSY_SANITIZE, - SKD_DRVR_STATE_BUSY_IMMINENT, - SKD_DRVR_STATE_WAIT_BOOT, - SKD_DRVR_STATE_SYNCING, -}; - -#define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u) -#define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u) -#define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u) -#define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u) -#define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u) -#define SKD_START_WAIT_SECONDS 90u - -enum skd_req_state { - SKD_REQ_STATE_IDLE, - SKD_REQ_STATE_SETUP, - SKD_REQ_STATE_BUSY, - SKD_REQ_STATE_COMPLETED, - SKD_REQ_STATE_TIMEOUT, -}; - -enum skd_check_status_action { - SKD_CHECK_STATUS_REPORT_GOOD, - SKD_CHECK_STATUS_REPORT_SMART_ALERT, - SKD_CHECK_STATUS_REQUEUE_REQUEST, - SKD_CHECK_STATUS_REPORT_ERROR, - SKD_CHECK_STATUS_BUSY_IMMINENT, -}; - -struct skd_msg_buf { - struct fit_msg_hdr fmh; - struct skd_scsi_request scsi[SKD_MAX_REQ_PER_MSG]; -}; - -struct skd_fitmsg_context { - u32 id; - - u32 length; - - struct skd_msg_buf *msg_buf; - dma_addr_t mb_dma_address; -}; - -struct skd_request_context { - enum skd_req_state state; - - u16 id; - u32 fitmsg_id; - - u8 flush_cmd; - - enum dma_data_direction data_dir; - struct scatterlist *sg; - u32 n_sg; - u32 sg_byte_count; - - struct fit_sg_descriptor *sksg_list; - dma_addr_t sksg_dma_address; - - struct fit_completion_entry_v1 completion; - - struct fit_comp_error_info err_info; - int retries; - - blk_status_t status; -}; - -struct skd_special_context { - struct skd_request_context req; - - void *data_buf; - dma_addr_t db_dma_address; - - struct skd_msg_buf *msg_buf; - dma_addr_t mb_dma_address; -}; - -typedef enum skd_irq_type { - SKD_IRQ_LEGACY, - SKD_IRQ_MSI, - SKD_IRQ_MSIX -} skd_irq_type_t; - -#define SKD_MAX_BARS 2 - -struct skd_device { - void __iomem *mem_map[SKD_MAX_BARS]; - resource_size_t mem_phys[SKD_MAX_BARS]; - u32 mem_size[SKD_MAX_BARS]; - - struct skd_msix_entry *msix_entries; - - struct pci_dev *pdev; - int pcie_error_reporting_is_enabled; - - spinlock_t lock; - struct gendisk *disk; - struct blk_mq_tag_set tag_set; - struct request_queue *queue; - struct skd_fitmsg_context *skmsg; - struct device *class_dev; - int gendisk_on; - int sync_done; - - u32 devno; - u32 major; - char isr_name[30]; - - enum skd_drvr_state state; - u32 drive_state; - - u32 cur_max_queue_depth; - u32 queue_low_water_mark; - u32 dev_max_queue_depth; - - u32 
num_fitmsg_context; - u32 num_req_context; - - struct skd_fitmsg_context *skmsg_table; - - struct skd_special_context internal_skspcl; - u32 read_cap_blocksize; - u32 read_cap_last_lba; - int read_cap_is_valid; - int inquiry_is_valid; - u8 inq_serial_num[13]; /*12 chars plus null term */ - - u8 skcomp_cycle; - u32 skcomp_ix; - struct kmem_cache *msgbuf_cache; - struct kmem_cache *sglist_cache; - struct kmem_cache *databuf_cache; - struct fit_completion_entry_v1 *skcomp_table; - struct fit_comp_error_info *skerr_table; - dma_addr_t cq_dma_address; - - wait_queue_head_t waitq; - - struct timer_list timer; - u32 timer_countdown; - u32 timer_substate; - - int sgs_per_request; - u32 last_mtd; - - u32 proto_ver; - - int dbg_level; - u32 connect_time_stamp; - int connect_retries; -#define SKD_MAX_CONNECT_RETRIES 16 - u32 drive_jiffies; - - u32 timo_slot; - - struct work_struct start_queue; - struct work_struct completion_worker; -}; - -#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF) -#define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF) -#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF) - -static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) -{ - u32 val = readl(skdev->mem_map[1] + offset); - - if (unlikely(skdev->dbg_level >= 2)) - dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val); - return val; -} - -static inline void skd_reg_write32(struct skd_device *skdev, u32 val, - u32 offset) -{ - writel(val, skdev->mem_map[1] + offset); - if (unlikely(skdev->dbg_level >= 2)) - dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val); -} - -static inline void skd_reg_write64(struct skd_device *skdev, u64 val, - u32 offset) -{ - writeq(val, skdev->mem_map[1] + offset); - if (unlikely(skdev->dbg_level >= 2)) - dev_dbg(&skdev->pdev->dev, "offset %x = %016llx\n", offset, - val); -} - - -#define SKD_IRQ_DEFAULT SKD_IRQ_MSIX -static int skd_isr_type = SKD_IRQ_DEFAULT; - -module_param(skd_isr_type, int, 0444); -MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability." - " (0==legacy, 1==MSI, 2==MSI-X, default==1)"); - -#define SKD_MAX_REQ_PER_MSG_DEFAULT 1 -static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; - -module_param(skd_max_req_per_msg, int, 0444); -MODULE_PARM_DESC(skd_max_req_per_msg, - "Maximum SCSI requests packed in a single message." - " (1-" __stringify(SKD_MAX_REQ_PER_MSG) ", default==1)"); - -#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64 -#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64" -static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; - -module_param(skd_max_queue_depth, int, 0444); -MODULE_PARM_DESC(skd_max_queue_depth, - "Maximum SCSI requests issued to s1120." - " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")"); - -static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; -module_param(skd_sgs_per_request, int, 0444); -MODULE_PARM_DESC(skd_sgs_per_request, - "Maximum SG elements per block request." - " (1-4096, default==256)"); - -static int skd_max_pass_thru = 1; -module_param(skd_max_pass_thru, int, 0444); -MODULE_PARM_DESC(skd_max_pass_thru, - "Maximum SCSI pass-thru at a time. IGNORED"); - -module_param(skd_dbg_level, int, 0444); -MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)"); - -module_param(skd_isr_comp_limit, int, 0444); -MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4"); - -/* Major device number dynamically assigned. 
*/ -static u32 skd_major; - -static void skd_destruct(struct skd_device *skdev); -static const struct block_device_operations skd_blockdev_ops; -static void skd_send_fitmsg(struct skd_device *skdev, - struct skd_fitmsg_context *skmsg); -static void skd_send_special_fitmsg(struct skd_device *skdev, - struct skd_special_context *skspcl); -static bool skd_preop_sg_list(struct skd_device *skdev, - struct skd_request_context *skreq); -static void skd_postop_sg_list(struct skd_device *skdev, - struct skd_request_context *skreq); - -static void skd_restart_device(struct skd_device *skdev); -static int skd_quiesce_dev(struct skd_device *skdev); -static int skd_unquiesce_dev(struct skd_device *skdev); -static void skd_disable_interrupts(struct skd_device *skdev); -static void skd_isr_fwstate(struct skd_device *skdev); -static void skd_recover_requests(struct skd_device *skdev); -static void skd_soft_reset(struct skd_device *skdev); - -const char *skd_drive_state_to_str(int state); -const char *skd_skdev_state_to_str(enum skd_drvr_state state); -static void skd_log_skdev(struct skd_device *skdev, const char *event); -static void skd_log_skreq(struct skd_device *skdev, - struct skd_request_context *skreq, const char *event); - -/* - ***************************************************************************** - * READ/WRITE REQUESTS - ***************************************************************************** - */ -static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved) -{ - int *count = data; - - count++; - return true; -} - -static int skd_in_flight(struct skd_device *skdev) -{ - int count = 0; - - blk_mq_tagset_busy_iter(&skdev->tag_set, skd_inc_in_flight, &count); - - return count; -} - -static void -skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, - int data_dir, unsigned lba, - unsigned count) -{ - if (data_dir == READ) - scsi_req->cdb[0] = READ_10; - else - scsi_req->cdb[0] = WRITE_10; - - scsi_req->cdb[1] = 0; - scsi_req->cdb[2] = (lba & 0xff000000) >> 24; - scsi_req->cdb[3] = (lba & 0xff0000) >> 16; - scsi_req->cdb[4] = (lba & 0xff00) >> 8; - scsi_req->cdb[5] = (lba & 0xff); - scsi_req->cdb[6] = 0; - scsi_req->cdb[7] = (count & 0xff00) >> 8; - scsi_req->cdb[8] = count & 0xff; - scsi_req->cdb[9] = 0; -} - -static void -skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, - struct skd_request_context *skreq) -{ - skreq->flush_cmd = 1; - - scsi_req->cdb[0] = SYNCHRONIZE_CACHE; - scsi_req->cdb[1] = 0; - scsi_req->cdb[2] = 0; - scsi_req->cdb[3] = 0; - scsi_req->cdb[4] = 0; - scsi_req->cdb[5] = 0; - scsi_req->cdb[6] = 0; - scsi_req->cdb[7] = 0; - scsi_req->cdb[8] = 0; - scsi_req->cdb[9] = 0; -} - -/* - * Return true if and only if all pending requests should be failed. - */ -static bool skd_fail_all(struct request_queue *q) -{ - struct skd_device *skdev = q->queuedata; - - SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); - - skd_log_skdev(skdev, "req_not_online"); - switch (skdev->state) { - case SKD_DRVR_STATE_PAUSING: - case SKD_DRVR_STATE_PAUSED: - case SKD_DRVR_STATE_STARTING: - case SKD_DRVR_STATE_RESTARTING: - case SKD_DRVR_STATE_WAIT_BOOT: - /* In case of starting, we haven't started the queue, - * so we can't get here... but requests are - * possibly hanging out waiting for us because we - * reported the dev/skd0 already. They'll wait - * forever if connect doesn't complete. - * What to do??? delay dev/skd0 ?? 
- */ - case SKD_DRVR_STATE_BUSY: - case SKD_DRVR_STATE_BUSY_IMMINENT: - case SKD_DRVR_STATE_BUSY_ERASE: - return false; - - case SKD_DRVR_STATE_BUSY_SANITIZE: - case SKD_DRVR_STATE_STOPPING: - case SKD_DRVR_STATE_SYNCING: - case SKD_DRVR_STATE_FAULT: - case SKD_DRVR_STATE_DISAPPEARED: - default: - return true; - } -} - -static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *mqd) -{ - struct request *const req = mqd->rq; - struct request_queue *const q = req->q; - struct skd_device *skdev = q->queuedata; - struct skd_fitmsg_context *skmsg; - struct fit_msg_hdr *fmh; - const u32 tag = blk_mq_unique_tag(req); - struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req); - struct skd_scsi_request *scsi_req; - unsigned long flags = 0; - const u32 lba = blk_rq_pos(req); - const u32 count = blk_rq_sectors(req); - const int data_dir = rq_data_dir(req); - - if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE)) - return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; - - if (!(req->rq_flags & RQF_DONTPREP)) { - skreq->retries = 0; - req->rq_flags |= RQF_DONTPREP; - } - - blk_mq_start_request(req); - - WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", - tag, skd_max_queue_depth, q->nr_requests); - - SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); - - dev_dbg(&skdev->pdev->dev, - "new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, - lba, count, count, data_dir); - - skreq->id = tag + SKD_ID_RW_REQUEST; - skreq->flush_cmd = 0; - skreq->n_sg = 0; - skreq->sg_byte_count = 0; - - skreq->fitmsg_id = 0; - - skreq->data_dir = data_dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - - if (req->bio && !skd_preop_sg_list(skdev, skreq)) { - dev_dbg(&skdev->pdev->dev, "error Out\n"); - skreq->status = BLK_STS_RESOURCE; - blk_mq_complete_request(req); - return BLK_STS_OK; - } - - dma_sync_single_for_device(&skdev->pdev->dev, skreq->sksg_dma_address, - skreq->n_sg * - sizeof(struct fit_sg_descriptor), - DMA_TO_DEVICE); - - /* Either a FIT msg is in progress or we have to start one. */ - if (skd_max_req_per_msg == 1) { - skmsg = NULL; - } else { - spin_lock_irqsave(&skdev->lock, flags); - skmsg = skdev->skmsg; - } - if (!skmsg) { - skmsg = &skdev->skmsg_table[tag]; - skdev->skmsg = skmsg; - - /* Initialize the FIT msg header */ - fmh = &skmsg->msg_buf->fmh; - memset(fmh, 0, sizeof(*fmh)); - fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; - skmsg->length = sizeof(*fmh); - } else { - fmh = &skmsg->msg_buf->fmh; - } - - skreq->fitmsg_id = skmsg->id; - - scsi_req = &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced]; - memset(scsi_req, 0, sizeof(*scsi_req)); - - scsi_req->hdr.tag = skreq->id; - scsi_req->hdr.sg_list_dma_address = - cpu_to_be64(skreq->sksg_dma_address); - - if (req_op(req) == REQ_OP_FLUSH) { - skd_prep_zerosize_flush_cdb(scsi_req, skreq); - SKD_ASSERT(skreq->flush_cmd == 1); - } else { - skd_prep_rw_cdb(scsi_req, data_dir, lba, count); - } - - if (req->cmd_flags & REQ_FUA) - scsi_req->cdb[1] |= SKD_FUA_NV; - - scsi_req->hdr.sg_list_len_bytes = cpu_to_be32(skreq->sg_byte_count); - - /* Complete resource allocations. */ - skreq->state = SKD_REQ_STATE_BUSY; - - skmsg->length += sizeof(struct skd_scsi_request); - fmh->num_protocol_cmds_coalesced++; - - dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, - skd_in_flight(skdev)); - - /* - * If the FIT msg buffer is full send it. 
- */ - if (skd_max_req_per_msg == 1) { - skd_send_fitmsg(skdev, skmsg); - } else { - if (mqd->last || - fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { - skd_send_fitmsg(skdev, skmsg); - skdev->skmsg = NULL; - } - spin_unlock_irqrestore(&skdev->lock, flags); - } - - return BLK_STS_OK; -} - -static enum blk_eh_timer_return skd_timed_out(struct request *req, - bool reserved) -{ - struct skd_device *skdev = req->q->queuedata; - - dev_err(&skdev->pdev->dev, "request with tag %#x timed out\n", - blk_mq_unique_tag(req)); - - return BLK_EH_RESET_TIMER; -} - -static void skd_complete_rq(struct request *req) -{ - struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); - - blk_mq_end_request(req, skreq->status); -} - -static bool skd_preop_sg_list(struct skd_device *skdev, - struct skd_request_context *skreq) -{ - struct request *req = blk_mq_rq_from_pdu(skreq); - struct scatterlist *sgl = &skreq->sg[0], *sg; - int n_sg; - int i; - - skreq->sg_byte_count = 0; - - WARN_ON_ONCE(skreq->data_dir != DMA_TO_DEVICE && - skreq->data_dir != DMA_FROM_DEVICE); - - n_sg = blk_rq_map_sg(skdev->queue, req, sgl); - if (n_sg <= 0) - return false; - - /* - * Map scatterlist to PCI bus addresses. - * Note PCI might change the number of entries. - */ - n_sg = dma_map_sg(&skdev->pdev->dev, sgl, n_sg, skreq->data_dir); - if (n_sg <= 0) - return false; - - SKD_ASSERT(n_sg <= skdev->sgs_per_request); - - skreq->n_sg = n_sg; - - for_each_sg(sgl, sg, n_sg, i) { - struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - u32 cnt = sg_dma_len(sg); - uint64_t dma_addr = sg_dma_address(sg); - - sgd->control = FIT_SGD_CONTROL_NOT_LAST; - sgd->byte_count = cnt; - skreq->sg_byte_count += cnt; - sgd->host_side_addr = dma_addr; - sgd->dev_side_addr = 0; - } - - skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL; - skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST; - - if (unlikely(skdev->dbg_level > 1)) { - dev_dbg(&skdev->pdev->dev, - "skreq=%x sksg_list=%p sksg_dma=%pad\n", - skreq->id, skreq->sksg_list, &skreq->sksg_dma_address); - for (i = 0; i < n_sg; i++) { - struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - - dev_dbg(&skdev->pdev->dev, - " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); - } - } - - return true; -} - -static void skd_postop_sg_list(struct skd_device *skdev, - struct skd_request_context *skreq) -{ - /* - * restore the next ptr for next IO request so we - * don't have to set it every time. - */ - skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr = - skreq->sksg_dma_address + - ((skreq->n_sg) * sizeof(struct fit_sg_descriptor)); - dma_unmap_sg(&skdev->pdev->dev, &skreq->sg[0], skreq->n_sg, - skreq->data_dir); -} - -/* - ***************************************************************************** - * TIMER - ***************************************************************************** - */ - -static void skd_timer_tick_not_online(struct skd_device *skdev); - -static void skd_start_queue(struct work_struct *work) -{ - struct skd_device *skdev = container_of(work, typeof(*skdev), - start_queue); - - /* - * Although it is safe to call blk_start_queue() from interrupt - * context, blk_mq_start_hw_queues() must not be called from - * interrupt context. 
- */ - blk_mq_start_hw_queues(skdev->queue); -} - -static void skd_timer_tick(struct timer_list *t) -{ - struct skd_device *skdev = from_timer(skdev, t, timer); - unsigned long reqflags; - u32 state; - - if (skdev->state == SKD_DRVR_STATE_FAULT) - /* The driver has declared fault, and we want it to - * stay that way until driver is reloaded. - */ - return; - - spin_lock_irqsave(&skdev->lock, reqflags); - - state = SKD_READL(skdev, FIT_STATUS); - state &= FIT_SR_DRIVE_STATE_MASK; - if (state != skdev->drive_state) - skd_isr_fwstate(skdev); - - if (skdev->state != SKD_DRVR_STATE_ONLINE) - skd_timer_tick_not_online(skdev); - - mod_timer(&skdev->timer, (jiffies + HZ)); - - spin_unlock_irqrestore(&skdev->lock, reqflags); -} - -static void skd_timer_tick_not_online(struct skd_device *skdev) -{ - switch (skdev->state) { - case SKD_DRVR_STATE_IDLE: - case SKD_DRVR_STATE_LOAD: - break; - case SKD_DRVR_STATE_BUSY_SANITIZE: - dev_dbg(&skdev->pdev->dev, - "drive busy sanitize[%x], driver[%x]\n", - skdev->drive_state, skdev->state); - /* If we've been in sanitize for 3 seconds, we figure we're not - * going to get anymore completions, so recover requests now - */ - if (skdev->timer_countdown > 0) { - skdev->timer_countdown--; - return; - } - skd_recover_requests(skdev); - break; - - case SKD_DRVR_STATE_BUSY: - case SKD_DRVR_STATE_BUSY_IMMINENT: - case SKD_DRVR_STATE_BUSY_ERASE: - dev_dbg(&skdev->pdev->dev, "busy[%x], countdown=%d\n", - skdev->state, skdev->timer_countdown); - if (skdev->timer_countdown > 0) { - skdev->timer_countdown--; - return; - } - dev_dbg(&skdev->pdev->dev, - "busy[%x], timedout=%d, restarting device.", - skdev->state, skdev->timer_countdown); - skd_restart_device(skdev); - break; - - case SKD_DRVR_STATE_WAIT_BOOT: - case SKD_DRVR_STATE_STARTING: - if (skdev->timer_countdown > 0) { - skdev->timer_countdown--; - return; - } - /* For now, we fault the drive. Could attempt resets to - * revcover at some point. */ - skdev->state = SKD_DRVR_STATE_FAULT; - - dev_err(&skdev->pdev->dev, "DriveFault Connect Timeout (%x)\n", - skdev->drive_state); - - /*start the queue so we can respond with error to requests */ - /* wakeup anyone waiting for startup complete */ - schedule_work(&skdev->start_queue); - skdev->gendisk_on = -1; - wake_up_interruptible(&skdev->waitq); - break; - - case SKD_DRVR_STATE_ONLINE: - /* shouldn't get here. */ - break; - - case SKD_DRVR_STATE_PAUSING: - case SKD_DRVR_STATE_PAUSED: - break; - - case SKD_DRVR_STATE_RESTARTING: - if (skdev->timer_countdown > 0) { - skdev->timer_countdown--; - return; - } - /* For now, we fault the drive. Could attempt resets to - * revcover at some point. */ - skdev->state = SKD_DRVR_STATE_FAULT; - dev_err(&skdev->pdev->dev, - "DriveFault Reconnect Timeout (%x)\n", - skdev->drive_state); - - /* - * Recovering does two things: - * 1. completes IO with error - * 2. reclaims dma resources - * When is it safe to recover requests? - * - if the drive state is faulted - * - if the state is still soft reset after out timeout - * - if the drive registers are dead (state = FF) - * If it is "unsafe", we still need to recover, so we will - * disable pci bus mastering and disable our interrupts. - */ - - if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) || - (skdev->drive_state == FIT_SR_DRIVE_FAULT) || - (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK)) - /* It never came out of soft reset. Try to - * recover the requests and then let them - * fail. This is to mitigate hung processes. 
*/ - skd_recover_requests(skdev); - else { - dev_err(&skdev->pdev->dev, "Disable BusMaster (%x)\n", - skdev->drive_state); - pci_disable_device(skdev->pdev); - skd_disable_interrupts(skdev); - skd_recover_requests(skdev); - } - - /*start the queue so we can respond with error to requests */ - /* wakeup anyone waiting for startup complete */ - schedule_work(&skdev->start_queue); - skdev->gendisk_on = -1; - wake_up_interruptible(&skdev->waitq); - break; - - case SKD_DRVR_STATE_RESUMING: - case SKD_DRVR_STATE_STOPPING: - case SKD_DRVR_STATE_SYNCING: - case SKD_DRVR_STATE_FAULT: - case SKD_DRVR_STATE_DISAPPEARED: - default: - break; - } -} - -static int skd_start_timer(struct skd_device *skdev) -{ - int rc; - - timer_setup(&skdev->timer, skd_timer_tick, 0); - - rc = mod_timer(&skdev->timer, (jiffies + HZ)); - if (rc) - dev_err(&skdev->pdev->dev, "failed to start timer %d\n", rc); - return rc; -} - -static void skd_kill_timer(struct skd_device *skdev) -{ - del_timer_sync(&skdev->timer); -} - -/* - ***************************************************************************** - * INTERNAL REQUESTS -- generated by driver itself - ***************************************************************************** - */ - -static int skd_format_internal_skspcl(struct skd_device *skdev) -{ - struct skd_special_context *skspcl = &skdev->internal_skspcl; - struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; - struct fit_msg_hdr *fmh; - uint64_t dma_address; - struct skd_scsi_request *scsi; - - fmh = &skspcl->msg_buf->fmh; - fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; - fmh->num_protocol_cmds_coalesced = 1; - - scsi = &skspcl->msg_buf->scsi[0]; - memset(scsi, 0, sizeof(*scsi)); - dma_address = skspcl->req.sksg_dma_address; - scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); - skspcl->req.n_sg = 1; - sgd->control = FIT_SGD_CONTROL_LAST; - sgd->byte_count = 0; - sgd->host_side_addr = skspcl->db_dma_address; - sgd->dev_side_addr = 0; - sgd->next_desc_ptr = 0LL; - - return 1; -} - -#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES - -static void skd_send_internal_skspcl(struct skd_device *skdev, - struct skd_special_context *skspcl, - u8 opcode) -{ - struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; - struct skd_scsi_request *scsi; - unsigned char *buf = skspcl->data_buf; - int i; - - if (skspcl->req.state != SKD_REQ_STATE_IDLE) - /* - * A refresh is already in progress. - * Just wait for it to finish. 
- */ - return; - - skspcl->req.state = SKD_REQ_STATE_BUSY; - - scsi = &skspcl->msg_buf->scsi[0]; - scsi->hdr.tag = skspcl->req.id; - - memset(scsi->cdb, 0, sizeof(scsi->cdb)); - - switch (opcode) { - case TEST_UNIT_READY: - scsi->cdb[0] = TEST_UNIT_READY; - sgd->byte_count = 0; - scsi->hdr.sg_list_len_bytes = 0; - break; - - case READ_CAPACITY: - scsi->cdb[0] = READ_CAPACITY; - sgd->byte_count = SKD_N_READ_CAP_BYTES; - scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); - break; - - case INQUIRY: - scsi->cdb[0] = INQUIRY; - scsi->cdb[1] = 0x01; /* evpd */ - scsi->cdb[2] = 0x80; /* serial number page */ - scsi->cdb[4] = 0x10; - sgd->byte_count = 16; - scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); - break; - - case SYNCHRONIZE_CACHE: - scsi->cdb[0] = SYNCHRONIZE_CACHE; - sgd->byte_count = 0; - scsi->hdr.sg_list_len_bytes = 0; - break; - - case WRITE_BUFFER: - scsi->cdb[0] = WRITE_BUFFER; - scsi->cdb[1] = 0x02; - scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; - scsi->cdb[8] = WR_BUF_SIZE & 0xFF; - sgd->byte_count = WR_BUF_SIZE; - scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); - /* fill incrementing byte pattern */ - for (i = 0; i < sgd->byte_count; i++) - buf[i] = i & 0xFF; - break; - - case READ_BUFFER: - scsi->cdb[0] = READ_BUFFER; - scsi->cdb[1] = 0x02; - scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; - scsi->cdb[8] = WR_BUF_SIZE & 0xFF; - sgd->byte_count = WR_BUF_SIZE; - scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); - memset(skspcl->data_buf, 0, sgd->byte_count); - break; - - default: - SKD_ASSERT("Don't know what to send"); - return; - - } - skd_send_special_fitmsg(skdev, skspcl); -} - -static void skd_refresh_device_data(struct skd_device *skdev) -{ - struct skd_special_context *skspcl = &skdev->internal_skspcl; - - skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY); -} - -static int skd_chk_read_buf(struct skd_device *skdev, - struct skd_special_context *skspcl) -{ - unsigned char *buf = skspcl->data_buf; - int i; - - /* check for incrementing byte pattern */ - for (i = 0; i < WR_BUF_SIZE; i++) - if (buf[i] != (i & 0xFF)) - return 1; - - return 0; -} - -static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, - u8 code, u8 qual, u8 fruc) -{ - /* If the check condition is of special interest, log a message */ - if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02) - && (code == 0x04) && (qual == 0x06)) { - dev_err(&skdev->pdev->dev, - "*** LOST_WRITE_DATA ERROR *** key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", - key, code, qual, fruc); - } -} - -static void skd_complete_internal(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - struct skd_special_context *skspcl) -{ - u8 *buf = skspcl->data_buf; - u8 status; - int i; - struct skd_scsi_request *scsi = &skspcl->msg_buf->scsi[0]; - - lockdep_assert_held(&skdev->lock); - - SKD_ASSERT(skspcl == &skdev->internal_skspcl); - - dev_dbg(&skdev->pdev->dev, "complete internal %x\n", scsi->cdb[0]); - - dma_sync_single_for_cpu(&skdev->pdev->dev, - skspcl->db_dma_address, - skspcl->req.sksg_list[0].byte_count, - DMA_BIDIRECTIONAL); - - skspcl->req.completion = *skcomp; - skspcl->req.state = SKD_REQ_STATE_IDLE; - - status = skspcl->req.completion.status; - - skd_log_check_status(skdev, status, skerr->key, skerr->code, - skerr->qual, skerr->fruc); - - switch (scsi->cdb[0]) { - case TEST_UNIT_READY: - if (status == SAM_STAT_GOOD) - skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); - else if ((status == 
SAM_STAT_CHECK_CONDITION) && - (skerr->key == MEDIUM_ERROR)) - skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); - else { - if (skdev->state == SKD_DRVR_STATE_STOPPING) { - dev_dbg(&skdev->pdev->dev, - "TUR failed, don't send anymore state 0x%x\n", - skdev->state); - return; - } - dev_dbg(&skdev->pdev->dev, - "**** TUR failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, - TEST_UNIT_READY); - } - break; - - case WRITE_BUFFER: - if (status == SAM_STAT_GOOD) - skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER); - else { - if (skdev->state == SKD_DRVR_STATE_STOPPING) { - dev_dbg(&skdev->pdev->dev, - "write buffer failed, don't send anymore state 0x%x\n", - skdev->state); - return; - } - dev_dbg(&skdev->pdev->dev, - "**** write buffer failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, - TEST_UNIT_READY); - } - break; - - case READ_BUFFER: - if (status == SAM_STAT_GOOD) { - if (skd_chk_read_buf(skdev, skspcl) == 0) - skd_send_internal_skspcl(skdev, skspcl, - READ_CAPACITY); - else { - dev_err(&skdev->pdev->dev, - "*** W/R Buffer mismatch %d ***\n", - skdev->connect_retries); - if (skdev->connect_retries < - SKD_MAX_CONNECT_RETRIES) { - skdev->connect_retries++; - skd_soft_reset(skdev); - } else { - dev_err(&skdev->pdev->dev, - "W/R Buffer Connect Error\n"); - return; - } - } - - } else { - if (skdev->state == SKD_DRVR_STATE_STOPPING) { - dev_dbg(&skdev->pdev->dev, - "read buffer failed, don't send anymore state 0x%x\n", - skdev->state); - return; - } - dev_dbg(&skdev->pdev->dev, - "**** read buffer failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, - TEST_UNIT_READY); - } - break; - - case READ_CAPACITY: - skdev->read_cap_is_valid = 0; - if (status == SAM_STAT_GOOD) { - skdev->read_cap_last_lba = - (buf[0] << 24) | (buf[1] << 16) | - (buf[2] << 8) | buf[3]; - skdev->read_cap_blocksize = - (buf[4] << 24) | (buf[5] << 16) | - (buf[6] << 8) | buf[7]; - - dev_dbg(&skdev->pdev->dev, "last lba %d, bs %d\n", - skdev->read_cap_last_lba, - skdev->read_cap_blocksize); - - set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); - - skdev->read_cap_is_valid = 1; - - skd_send_internal_skspcl(skdev, skspcl, INQUIRY); - } else if ((status == SAM_STAT_CHECK_CONDITION) && - (skerr->key == MEDIUM_ERROR)) { - skdev->read_cap_last_lba = ~0; - set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); - dev_dbg(&skdev->pdev->dev, "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n"); - skd_send_internal_skspcl(skdev, skspcl, INQUIRY); - } else { - dev_dbg(&skdev->pdev->dev, "**** READCAP failed, retry TUR\n"); - skd_send_internal_skspcl(skdev, skspcl, - TEST_UNIT_READY); - } - break; - - case INQUIRY: - skdev->inquiry_is_valid = 0; - if (status == SAM_STAT_GOOD) { - skdev->inquiry_is_valid = 1; - - for (i = 0; i < 12; i++) - skdev->inq_serial_num[i] = buf[i + 4]; - skdev->inq_serial_num[12] = 0; - } - - if (skd_unquiesce_dev(skdev) < 0) - dev_dbg(&skdev->pdev->dev, "**** failed, to ONLINE device\n"); - /* connection is complete */ - skdev->connect_retries = 0; - break; - - case SYNCHRONIZE_CACHE: - if (status == SAM_STAT_GOOD) - skdev->sync_done = 1; - else - skdev->sync_done = -1; - wake_up_interruptible(&skdev->waitq); - break; - - default: - SKD_ASSERT("we didn't send this"); - } -} - -/* - ***************************************************************************** - * FIT MESSAGES - ***************************************************************************** - */ - -static void skd_send_fitmsg(struct skd_device *skdev, - 
struct skd_fitmsg_context *skmsg) -{ - u64 qcmd; - - dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n", - &skmsg->mb_dma_address, skd_in_flight(skdev)); - dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); - - qcmd = skmsg->mb_dma_address; - qcmd |= FIT_QCMD_QID_NORMAL; - - if (unlikely(skdev->dbg_level > 1)) { - u8 *bp = (u8 *)skmsg->msg_buf; - int i; - for (i = 0; i < skmsg->length; i += 8) { - dev_dbg(&skdev->pdev->dev, "msg[%2d] %8ph\n", i, - &bp[i]); - if (i == 0) - i = 64 - 8; - } - } - - if (skmsg->length > 256) - qcmd |= FIT_QCMD_MSGSIZE_512; - else if (skmsg->length > 128) - qcmd |= FIT_QCMD_MSGSIZE_256; - else if (skmsg->length > 64) - qcmd |= FIT_QCMD_MSGSIZE_128; - else - /* - * This makes no sense because the FIT msg header is - * 64 bytes. If the msg is only 64 bytes long it has - * no payload. - */ - qcmd |= FIT_QCMD_MSGSIZE_64; - - dma_sync_single_for_device(&skdev->pdev->dev, skmsg->mb_dma_address, - skmsg->length, DMA_TO_DEVICE); - - /* Make sure skd_msg_buf is written before the doorbell is triggered. */ - smp_wmb(); - - SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); -} - -static void skd_send_special_fitmsg(struct skd_device *skdev, - struct skd_special_context *skspcl) -{ - u64 qcmd; - - WARN_ON_ONCE(skspcl->req.n_sg != 1); - - if (unlikely(skdev->dbg_level > 1)) { - u8 *bp = (u8 *)skspcl->msg_buf; - int i; - - for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) { - dev_dbg(&skdev->pdev->dev, " spcl[%2d] %8ph\n", i, - &bp[i]); - if (i == 0) - i = 64 - 8; - } - - dev_dbg(&skdev->pdev->dev, - "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n", - skspcl, skspcl->req.id, skspcl->req.sksg_list, - &skspcl->req.sksg_dma_address); - for (i = 0; i < skspcl->req.n_sg; i++) { - struct fit_sg_descriptor *sgd = - &skspcl->req.sksg_list[i]; - - dev_dbg(&skdev->pdev->dev, - " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); - } - } - - /* - * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr - * and one 64-byte SSDI command. - */ - qcmd = skspcl->mb_dma_address; - qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; - - dma_sync_single_for_device(&skdev->pdev->dev, skspcl->mb_dma_address, - SKD_N_SPECIAL_FITMSG_BYTES, DMA_TO_DEVICE); - dma_sync_single_for_device(&skdev->pdev->dev, - skspcl->req.sksg_dma_address, - 1 * sizeof(struct fit_sg_descriptor), - DMA_TO_DEVICE); - dma_sync_single_for_device(&skdev->pdev->dev, - skspcl->db_dma_address, - skspcl->req.sksg_list[0].byte_count, - DMA_BIDIRECTIONAL); - - /* Make sure skd_msg_buf is written before the doorbell is triggered. 
*/ - smp_wmb(); - - SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); -} - -/* - ***************************************************************************** - * COMPLETION QUEUE - ***************************************************************************** - */ - -static void skd_complete_other(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr); - -struct sns_info { - u8 type; - u8 stat; - u8 key; - u8 asc; - u8 ascq; - u8 mask; - enum skd_check_status_action action; -}; - -static struct sns_info skd_chkstat_table[] = { - /* Good */ - { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c, - SKD_CHECK_STATUS_REPORT_GOOD }, - - /* Smart alerts */ - { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */ - SKD_CHECK_STATUS_REPORT_SMART_ALERT }, - { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */ - SKD_CHECK_STATUS_REPORT_SMART_ALERT }, - { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */ - SKD_CHECK_STATUS_REPORT_SMART_ALERT }, - - /* Retry (with limits) */ - { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */ - SKD_CHECK_STATUS_REQUEUE_REQUEST }, - { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */ - SKD_CHECK_STATUS_REQUEUE_REQUEST }, - { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */ - SKD_CHECK_STATUS_REQUEUE_REQUEST }, - { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */ - SKD_CHECK_STATUS_REQUEUE_REQUEST }, - - /* Busy (or about to be) */ - { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */ - SKD_CHECK_STATUS_BUSY_IMMINENT }, -}; - -/* - * Look up status and sense data to decide how to handle the error - * from the device. - * mask says which fields must match e.g., mask=0x18 means check - * type and stat, ignore key, asc, ascq. - */ - -static enum skd_check_status_action -skd_check_status(struct skd_device *skdev, - u8 cmp_status, struct fit_comp_error_info *skerr) -{ - int i; - - dev_err(&skdev->pdev->dev, "key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", - skerr->key, skerr->code, skerr->qual, skerr->fruc); - - dev_dbg(&skdev->pdev->dev, - "stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", - skerr->type, cmp_status, skerr->key, skerr->code, skerr->qual, - skerr->fruc); - - /* Does the info match an entry in the good category? 
*/ - for (i = 0; i < ARRAY_SIZE(skd_chkstat_table); i++) { - struct sns_info *sns = &skd_chkstat_table[i]; - - if (sns->mask & 0x10) - if (skerr->type != sns->type) - continue; - - if (sns->mask & 0x08) - if (cmp_status != sns->stat) - continue; - - if (sns->mask & 0x04) - if (skerr->key != sns->key) - continue; - - if (sns->mask & 0x02) - if (skerr->code != sns->asc) - continue; - - if (sns->mask & 0x01) - if (skerr->qual != sns->ascq) - continue; - - if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) { - dev_err(&skdev->pdev->dev, - "SMART Alert: sense key/asc/ascq %02x/%02x/%02x\n", - skerr->key, skerr->code, skerr->qual); - } - return sns->action; - } - - /* No other match, so nonzero status means error, - * zero status means good - */ - if (cmp_status) { - dev_dbg(&skdev->pdev->dev, "status check: error\n"); - return SKD_CHECK_STATUS_REPORT_ERROR; - } - - dev_dbg(&skdev->pdev->dev, "status check good default\n"); - return SKD_CHECK_STATUS_REPORT_GOOD; -} - -static void skd_resolve_req_exception(struct skd_device *skdev, - struct skd_request_context *skreq, - struct request *req) -{ - u8 cmp_status = skreq->completion.status; - - switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { - case SKD_CHECK_STATUS_REPORT_GOOD: - case SKD_CHECK_STATUS_REPORT_SMART_ALERT: - skreq->status = BLK_STS_OK; - if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); - break; - - case SKD_CHECK_STATUS_BUSY_IMMINENT: - skd_log_skreq(skdev, skreq, "retry(busy)"); - blk_mq_requeue_request(req, true); - dev_info(&skdev->pdev->dev, "drive BUSY imminent\n"); - skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; - skdev->timer_countdown = SKD_TIMER_MINUTES(20); - skd_quiesce_dev(skdev); - break; - - case SKD_CHECK_STATUS_REQUEUE_REQUEST: - if (++skreq->retries < SKD_MAX_RETRIES) { - skd_log_skreq(skdev, skreq, "retry"); - blk_mq_requeue_request(req, true); - break; - } - fallthrough; - - case SKD_CHECK_STATUS_REPORT_ERROR: - default: - skreq->status = BLK_STS_IOERR; - if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); - break; - } -} - -static void skd_release_skreq(struct skd_device *skdev, - struct skd_request_context *skreq) -{ - /* - * Reclaim the skd_request_context - */ - skreq->state = SKD_REQ_STATE_IDLE; -} - -static int skd_isr_completion_posted(struct skd_device *skdev, - int limit, int *enqueued) -{ - struct fit_completion_entry_v1 *skcmp; - struct fit_comp_error_info *skerr; - u16 req_id; - u32 tag; - u16 hwq = 0; - struct request *rq; - struct skd_request_context *skreq; - u16 cmp_cntxt; - u8 cmp_status; - u8 cmp_cycle; - u32 cmp_bytes; - int rc = 0; - int processed = 0; - - lockdep_assert_held(&skdev->lock); - - for (;; ) { - SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY); - - skcmp = &skdev->skcomp_table[skdev->skcomp_ix]; - cmp_cycle = skcmp->cycle; - cmp_cntxt = skcmp->tag; - cmp_status = skcmp->status; - cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes); - - skerr = &skdev->skerr_table[skdev->skcomp_ix]; - - dev_dbg(&skdev->pdev->dev, - "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d busy=%d rbytes=0x%x proto=%d\n", - skdev->skcomp_cycle, skdev->skcomp_ix, cmp_cycle, - cmp_cntxt, cmp_status, skd_in_flight(skdev), - cmp_bytes, skdev->proto_ver); - - if (cmp_cycle != skdev->skcomp_cycle) { - dev_dbg(&skdev->pdev->dev, "end of completions\n"); - break; - } - /* - * Update the completion queue head index and possibly - * the completion cycle count. 8-bit wrap-around. 
- */ - skdev->skcomp_ix++; - if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) { - skdev->skcomp_ix = 0; - skdev->skcomp_cycle++; - } - - /* - * The command context is a unique 32-bit ID. The low order - * bits help locate the request. The request is usually a - * r/w request (see skd_start() above) or a special request. - */ - req_id = cmp_cntxt; - tag = req_id & SKD_ID_SLOT_AND_TABLE_MASK; - - /* Is this other than a r/w request? */ - if (tag >= skdev->num_req_context) { - /* - * This is not a completion for a r/w request. - */ - WARN_ON_ONCE(blk_mq_tag_to_rq(skdev->tag_set.tags[hwq], - tag)); - skd_complete_other(skdev, skcmp, skerr); - continue; - } - - rq = blk_mq_tag_to_rq(skdev->tag_set.tags[hwq], tag); - if (WARN(!rq, "No request for tag %#x -> %#x\n", cmp_cntxt, - tag)) - continue; - skreq = blk_mq_rq_to_pdu(rq); - - /* - * Make sure the request ID for the slot matches. - */ - if (skreq->id != req_id) { - dev_err(&skdev->pdev->dev, - "Completion mismatch comp_id=0x%04x skreq=0x%04x new=0x%04x\n", - req_id, skreq->id, cmp_cntxt); - - continue; - } - - SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); - - skreq->completion = *skcmp; - if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) { - skreq->err_info = *skerr; - skd_log_check_status(skdev, cmp_status, skerr->key, - skerr->code, skerr->qual, - skerr->fruc); - } - /* Release DMA resources for the request. */ - if (skreq->n_sg > 0) - skd_postop_sg_list(skdev, skreq); - - skd_release_skreq(skdev, skreq); - - /* - * Capture the outcome and post it back to the native request. - */ - if (likely(cmp_status == SAM_STAT_GOOD)) { - skreq->status = BLK_STS_OK; - if (likely(!blk_should_fake_timeout(rq->q))) - blk_mq_complete_request(rq); - } else { - skd_resolve_req_exception(skdev, skreq, rq); - } - - /* skd_isr_comp_limit equal zero means no limit */ - if (limit) { - if (++processed >= limit) { - rc = 1; - break; - } - } - } - - if (skdev->state == SKD_DRVR_STATE_PAUSING && - skd_in_flight(skdev) == 0) { - skdev->state = SKD_DRVR_STATE_PAUSED; - wake_up_interruptible(&skdev->waitq); - } - - return rc; -} - -static void skd_complete_other(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr) -{ - u32 req_id = 0; - u32 req_table; - u32 req_slot; - struct skd_special_context *skspcl; - - lockdep_assert_held(&skdev->lock); - - req_id = skcomp->tag; - req_table = req_id & SKD_ID_TABLE_MASK; - req_slot = req_id & SKD_ID_SLOT_MASK; - - dev_dbg(&skdev->pdev->dev, "table=0x%x id=0x%x slot=%d\n", req_table, - req_id, req_slot); - - /* - * Based on the request id, determine how to dispatch this completion. - * This swich/case is finding the good cases and forwarding the - * completion entry. Errors are reported below the switch. - */ - switch (req_table) { - case SKD_ID_RW_REQUEST: - /* - * The caller, skd_isr_completion_posted() above, - * handles r/w requests. The only way we get here - * is if the req_slot is out of bounds. - */ - break; - - case SKD_ID_INTERNAL: - if (req_slot == 0) { - skspcl = &skdev->internal_skspcl; - if (skspcl->req.id == req_id && - skspcl->req.state == SKD_REQ_STATE_BUSY) { - skd_complete_internal(skdev, - skcomp, skerr, skspcl); - return; - } - } - break; - - case SKD_ID_FIT_MSG: - /* - * These id's should never appear in a completion record. - */ - break; - - default: - /* - * These id's should never appear anywhere; - */ - break; - } - - /* - * If we get here it is a bad or stale id. 
- */ -} - -static void skd_reset_skcomp(struct skd_device *skdev) -{ - memset(skdev->skcomp_table, 0, SKD_SKCOMP_SIZE); - - skdev->skcomp_ix = 0; - skdev->skcomp_cycle = 1; -} - -/* - ***************************************************************************** - * INTERRUPTS - ***************************************************************************** - */ -static void skd_completion_worker(struct work_struct *work) -{ - struct skd_device *skdev = - container_of(work, struct skd_device, completion_worker); - unsigned long flags; - int flush_enqueued = 0; - - spin_lock_irqsave(&skdev->lock, flags); - - /* - * pass in limit=0, which means no limit.. - * process everything in compq - */ - skd_isr_completion_posted(skdev, 0, &flush_enqueued); - schedule_work(&skdev->start_queue); - - spin_unlock_irqrestore(&skdev->lock, flags); -} - -static void skd_isr_msg_from_dev(struct skd_device *skdev); - -static irqreturn_t -skd_isr(int irq, void *ptr) -{ - struct skd_device *skdev = ptr; - u32 intstat; - u32 ack; - int rc = 0; - int deferred = 0; - int flush_enqueued = 0; - - spin_lock(&skdev->lock); - - for (;; ) { - intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST); - - ack = FIT_INT_DEF_MASK; - ack &= intstat; - - dev_dbg(&skdev->pdev->dev, "intstat=0x%x ack=0x%x\n", intstat, - ack); - - /* As long as there is an int pending on device, keep - * running loop. When none, get out, but if we've never - * done any processing, call completion handler? - */ - if (ack == 0) { - /* No interrupts on device, but run the completion - * processor anyway? - */ - if (rc == 0) - if (likely (skdev->state - == SKD_DRVR_STATE_ONLINE)) - deferred = 1; - break; - } - - rc = IRQ_HANDLED; - - SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST); - - if (likely((skdev->state != SKD_DRVR_STATE_LOAD) && - (skdev->state != SKD_DRVR_STATE_STOPPING))) { - if (intstat & FIT_ISH_COMPLETION_POSTED) { - /* - * If we have already deferred completion - * processing, don't bother running it again - */ - if (deferred == 0) - deferred = - skd_isr_completion_posted(skdev, - skd_isr_comp_limit, &flush_enqueued); - } - - if (intstat & FIT_ISH_FW_STATE_CHANGE) { - skd_isr_fwstate(skdev); - if (skdev->state == SKD_DRVR_STATE_FAULT || - skdev->state == - SKD_DRVR_STATE_DISAPPEARED) { - spin_unlock(&skdev->lock); - return rc; - } - } - - if (intstat & FIT_ISH_MSG_FROM_DEV) - skd_isr_msg_from_dev(skdev); - } - } - - if (unlikely(flush_enqueued)) - schedule_work(&skdev->start_queue); - - if (deferred) - schedule_work(&skdev->completion_worker); - else if (!flush_enqueued) - schedule_work(&skdev->start_queue); - - spin_unlock(&skdev->lock); - - return rc; -} - -static void skd_drive_fault(struct skd_device *skdev) -{ - skdev->state = SKD_DRVR_STATE_FAULT; - dev_err(&skdev->pdev->dev, "Drive FAULT\n"); -} - -static void skd_drive_disappeared(struct skd_device *skdev) -{ - skdev->state = SKD_DRVR_STATE_DISAPPEARED; - dev_err(&skdev->pdev->dev, "Drive DISAPPEARED\n"); -} - -static void skd_isr_fwstate(struct skd_device *skdev) -{ - u32 sense; - u32 state; - u32 mtd; - int prev_driver_state = skdev->state; - - sense = SKD_READL(skdev, FIT_STATUS); - state = sense & FIT_SR_DRIVE_STATE_MASK; - - dev_err(&skdev->pdev->dev, "s1120 state %s(%d)=>%s(%d)\n", - skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, - skd_drive_state_to_str(state), state); - - skdev->drive_state = state; - - switch (skdev->drive_state) { - case FIT_SR_DRIVE_INIT: - if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) { - skd_disable_interrupts(skdev); - break; - } - if 
(skdev->state == SKD_DRVR_STATE_RESTARTING) - skd_recover_requests(skdev); - if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) { - skdev->timer_countdown = SKD_STARTING_TIMO; - skdev->state = SKD_DRVR_STATE_STARTING; - skd_soft_reset(skdev); - break; - } - mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_SR_DRIVE_ONLINE: - skdev->cur_max_queue_depth = skd_max_queue_depth; - if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth) - skdev->cur_max_queue_depth = skdev->dev_max_queue_depth; - - skdev->queue_low_water_mark = - skdev->cur_max_queue_depth * 2 / 3 + 1; - if (skdev->queue_low_water_mark < 1) - skdev->queue_low_water_mark = 1; - dev_info(&skdev->pdev->dev, - "Queue depth limit=%d dev=%d lowat=%d\n", - skdev->cur_max_queue_depth, - skdev->dev_max_queue_depth, - skdev->queue_low_water_mark); - - skd_refresh_device_data(skdev); - break; - - case FIT_SR_DRIVE_BUSY: - skdev->state = SKD_DRVR_STATE_BUSY; - skdev->timer_countdown = SKD_BUSY_TIMO; - skd_quiesce_dev(skdev); - break; - case FIT_SR_DRIVE_BUSY_SANITIZE: - /* set timer for 3 seconds, we'll abort any unfinished - * commands after that expires - */ - skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; - skdev->timer_countdown = SKD_TIMER_SECONDS(3); - schedule_work(&skdev->start_queue); - break; - case FIT_SR_DRIVE_BUSY_ERASE: - skdev->state = SKD_DRVR_STATE_BUSY_ERASE; - skdev->timer_countdown = SKD_BUSY_TIMO; - break; - case FIT_SR_DRIVE_OFFLINE: - skdev->state = SKD_DRVR_STATE_IDLE; - break; - case FIT_SR_DRIVE_SOFT_RESET: - switch (skdev->state) { - case SKD_DRVR_STATE_STARTING: - case SKD_DRVR_STATE_RESTARTING: - /* Expected by a caller of skd_soft_reset() */ - break; - default: - skdev->state = SKD_DRVR_STATE_RESTARTING; - break; - } - break; - case FIT_SR_DRIVE_FW_BOOTING: - dev_dbg(&skdev->pdev->dev, "ISR FIT_SR_DRIVE_FW_BOOTING\n"); - skdev->state = SKD_DRVR_STATE_WAIT_BOOT; - skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; - break; - - case FIT_SR_DRIVE_DEGRADED: - case FIT_SR_PCIE_LINK_DOWN: - case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: - break; - - case FIT_SR_DRIVE_FAULT: - skd_drive_fault(skdev); - skd_recover_requests(skdev); - schedule_work(&skdev->start_queue); - break; - - /* PCIe bus returned all Fs? */ - case 0xFF: - dev_info(&skdev->pdev->dev, "state=0x%x sense=0x%x\n", state, - sense); - skd_drive_disappeared(skdev); - skd_recover_requests(skdev); - schedule_work(&skdev->start_queue); - break; - default: - /* - * Uknown FW State. Wait for a state we recognize. - */ - break; - } - dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n", - skd_skdev_state_to_str(prev_driver_state), prev_driver_state, - skd_skdev_state_to_str(skdev->state), skdev->state); -} - -static bool skd_recover_request(struct request *req, void *data, bool reserved) -{ - struct skd_device *const skdev = data; - struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); - - if (skreq->state != SKD_REQ_STATE_BUSY) - return true; - - skd_log_skreq(skdev, skreq, "recover"); - - /* Release DMA resources for the request. 
*/ - if (skreq->n_sg > 0) - skd_postop_sg_list(skdev, skreq); - - skreq->state = SKD_REQ_STATE_IDLE; - skreq->status = BLK_STS_IOERR; - blk_mq_complete_request(req); - return true; -} - -static void skd_recover_requests(struct skd_device *skdev) -{ - blk_mq_tagset_busy_iter(&skdev->tag_set, skd_recover_request, skdev); -} - -static void skd_isr_msg_from_dev(struct skd_device *skdev) -{ - u32 mfd; - u32 mtd; - u32 data; - - mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); - - dev_dbg(&skdev->pdev->dev, "mfd=0x%x last_mtd=0x%x\n", mfd, - skdev->last_mtd); - - /* ignore any mtd that is an ack for something we didn't send */ - if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd)) - return; - - switch (FIT_MXD_TYPE(mfd)) { - case FIT_MTD_FITFW_INIT: - skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd); - - if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) { - dev_err(&skdev->pdev->dev, "protocol mismatch\n"); - dev_err(&skdev->pdev->dev, " got=%d support=%d\n", - skdev->proto_ver, FIT_PROTOCOL_VERSION_1); - dev_err(&skdev->pdev->dev, " please upgrade driver\n"); - skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH; - skd_soft_reset(skdev); - break; - } - mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_GET_CMDQ_DEPTH: - skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd); - mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0, - SKD_N_COMPLETION_ENTRY); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_SET_COMPQ_DEPTH: - SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG); - mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_SET_COMPQ_ADDR: - skd_reset_skcomp(skdev); - mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_CMD_LOG_HOST_ID: - /* hardware interface overflows in y2106 */ - skdev->connect_time_stamp = (u32)ktime_get_real_seconds(); - data = skdev->connect_time_stamp & 0xFFFF; - mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_CMD_LOG_TIME_STAMP_LO: - skdev->drive_jiffies = FIT_MXD_DATA(mfd); - data = (skdev->connect_time_stamp >> 16) & 0xFFFF; - mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - break; - - case FIT_MTD_CMD_LOG_TIME_STAMP_HI: - skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16); - mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0); - SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); - skdev->last_mtd = mtd; - - dev_err(&skdev->pdev->dev, "Time sync driver=0x%x device=0x%x\n", - skdev->connect_time_stamp, skdev->drive_jiffies); - break; - - case FIT_MTD_ARM_QUEUE: - skdev->last_mtd = 0; - /* - * State should be, or soon will be, FIT_SR_DRIVE_ONLINE. - */ - break; - - default: - break; - } -} - -static void skd_disable_interrupts(struct skd_device *skdev) -{ - u32 sense; - - sense = SKD_READL(skdev, FIT_CONTROL); - sense &= ~FIT_CR_ENABLE_INTERRUPTS; - SKD_WRITEL(skdev, sense, FIT_CONTROL); - dev_dbg(&skdev->pdev->dev, "sense 0x%x\n", sense); - - /* Note that the 1s is written. A 1-bit means - * disable, a 0 means enable. 
- */ - SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST); -} - -static void skd_enable_interrupts(struct skd_device *skdev) -{ - u32 val; - - /* unmask interrupts first */ - val = FIT_ISH_FW_STATE_CHANGE + - FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV; - - /* Note that the compliment of mask is written. A 1-bit means - * disable, a 0 means enable. */ - SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST); - dev_dbg(&skdev->pdev->dev, "interrupt mask=0x%x\n", ~val); - - val = SKD_READL(skdev, FIT_CONTROL); - val |= FIT_CR_ENABLE_INTERRUPTS; - dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val); - SKD_WRITEL(skdev, val, FIT_CONTROL); -} - -/* - ***************************************************************************** - * START, STOP, RESTART, QUIESCE, UNQUIESCE - ***************************************************************************** - */ - -static void skd_soft_reset(struct skd_device *skdev) -{ - u32 val; - - val = SKD_READL(skdev, FIT_CONTROL); - val |= (FIT_CR_SOFT_RESET); - dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val); - SKD_WRITEL(skdev, val, FIT_CONTROL); -} - -static void skd_start_device(struct skd_device *skdev) -{ - unsigned long flags; - u32 sense; - u32 state; - - spin_lock_irqsave(&skdev->lock, flags); - - /* ack all ghost interrupts */ - SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); - - sense = SKD_READL(skdev, FIT_STATUS); - - dev_dbg(&skdev->pdev->dev, "initial status=0x%x\n", sense); - - state = sense & FIT_SR_DRIVE_STATE_MASK; - skdev->drive_state = state; - skdev->last_mtd = 0; - - skdev->state = SKD_DRVR_STATE_STARTING; - skdev->timer_countdown = SKD_STARTING_TIMO; - - skd_enable_interrupts(skdev); - - switch (skdev->drive_state) { - case FIT_SR_DRIVE_OFFLINE: - dev_err(&skdev->pdev->dev, "Drive offline...\n"); - break; - - case FIT_SR_DRIVE_FW_BOOTING: - dev_dbg(&skdev->pdev->dev, "FIT_SR_DRIVE_FW_BOOTING\n"); - skdev->state = SKD_DRVR_STATE_WAIT_BOOT; - skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; - break; - - case FIT_SR_DRIVE_BUSY_SANITIZE: - dev_info(&skdev->pdev->dev, "Start: BUSY_SANITIZE\n"); - skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; - skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; - break; - - case FIT_SR_DRIVE_BUSY_ERASE: - dev_info(&skdev->pdev->dev, "Start: BUSY_ERASE\n"); - skdev->state = SKD_DRVR_STATE_BUSY_ERASE; - skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; - break; - - case FIT_SR_DRIVE_INIT: - case FIT_SR_DRIVE_ONLINE: - skd_soft_reset(skdev); - break; - - case FIT_SR_DRIVE_BUSY: - dev_err(&skdev->pdev->dev, "Drive Busy...\n"); - skdev->state = SKD_DRVR_STATE_BUSY; - skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; - break; - - case FIT_SR_DRIVE_SOFT_RESET: - dev_err(&skdev->pdev->dev, "drive soft reset in prog\n"); - break; - - case FIT_SR_DRIVE_FAULT: - /* Fault state is bad...soft reset won't do it... - * Hard reset, maybe, but does it work on device? - * For now, just fault so the system doesn't hang. - */ - skd_drive_fault(skdev); - /*start the queue so we can respond with error to requests */ - dev_dbg(&skdev->pdev->dev, "starting queue\n"); - schedule_work(&skdev->start_queue); - skdev->gendisk_on = -1; - wake_up_interruptible(&skdev->waitq); - break; - - case 0xFF: - /* Most likely the device isn't there or isn't responding - * to the BAR1 addresses. 
*/ - skd_drive_disappeared(skdev); - /*start the queue so we can respond with error to requests */ - dev_dbg(&skdev->pdev->dev, - "starting queue to error-out reqs\n"); - schedule_work(&skdev->start_queue); - skdev->gendisk_on = -1; - wake_up_interruptible(&skdev->waitq); - break; - - default: - dev_err(&skdev->pdev->dev, "Start: unknown state %x\n", - skdev->drive_state); - break; - } - - state = SKD_READL(skdev, FIT_CONTROL); - dev_dbg(&skdev->pdev->dev, "FIT Control Status=0x%x\n", state); - - state = SKD_READL(skdev, FIT_INT_STATUS_HOST); - dev_dbg(&skdev->pdev->dev, "Intr Status=0x%x\n", state); - - state = SKD_READL(skdev, FIT_INT_MASK_HOST); - dev_dbg(&skdev->pdev->dev, "Intr Mask=0x%x\n", state); - - state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); - dev_dbg(&skdev->pdev->dev, "Msg from Dev=0x%x\n", state); - - state = SKD_READL(skdev, FIT_HW_VERSION); - dev_dbg(&skdev->pdev->dev, "HW version=0x%x\n", state); - - spin_unlock_irqrestore(&skdev->lock, flags); -} - -static void skd_stop_device(struct skd_device *skdev) -{ - unsigned long flags; - struct skd_special_context *skspcl = &skdev->internal_skspcl; - u32 dev_state; - int i; - - spin_lock_irqsave(&skdev->lock, flags); - - if (skdev->state != SKD_DRVR_STATE_ONLINE) { - dev_err(&skdev->pdev->dev, "%s not online no sync\n", __func__); - goto stop_out; - } - - if (skspcl->req.state != SKD_REQ_STATE_IDLE) { - dev_err(&skdev->pdev->dev, "%s no special\n", __func__); - goto stop_out; - } - - skdev->state = SKD_DRVR_STATE_SYNCING; - skdev->sync_done = 0; - - skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE); - - spin_unlock_irqrestore(&skdev->lock, flags); - - wait_event_interruptible_timeout(skdev->waitq, - (skdev->sync_done), (10 * HZ)); - - spin_lock_irqsave(&skdev->lock, flags); - - switch (skdev->sync_done) { - case 0: - dev_err(&skdev->pdev->dev, "%s no sync\n", __func__); - break; - case 1: - dev_err(&skdev->pdev->dev, "%s sync done\n", __func__); - break; - default: - dev_err(&skdev->pdev->dev, "%s sync error\n", __func__); - } - -stop_out: - skdev->state = SKD_DRVR_STATE_STOPPING; - spin_unlock_irqrestore(&skdev->lock, flags); - - skd_kill_timer(skdev); - - spin_lock_irqsave(&skdev->lock, flags); - skd_disable_interrupts(skdev); - - /* ensure all ints on device are cleared */ - /* soft reset the device to unload with a clean slate */ - SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); - SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL); - - spin_unlock_irqrestore(&skdev->lock, flags); - - /* poll every 100ms, 1 second timeout */ - for (i = 0; i < 10; i++) { - dev_state = - SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK; - if (dev_state == FIT_SR_DRIVE_INIT) - break; - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(100)); - } - - if (dev_state != FIT_SR_DRIVE_INIT) - dev_err(&skdev->pdev->dev, "%s state error 0x%02x\n", __func__, - dev_state); -} - -/* assume spinlock is held */ -static void skd_restart_device(struct skd_device *skdev) -{ - u32 state; - - /* ack all ghost interrupts */ - SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); - - state = SKD_READL(skdev, FIT_STATUS); - - dev_dbg(&skdev->pdev->dev, "drive status=0x%x\n", state); - - state &= FIT_SR_DRIVE_STATE_MASK; - skdev->drive_state = state; - skdev->last_mtd = 0; - - skdev->state = SKD_DRVR_STATE_RESTARTING; - skdev->timer_countdown = SKD_RESTARTING_TIMO; - - skd_soft_reset(skdev); -} - -/* assume spinlock is held */ -static int skd_quiesce_dev(struct skd_device *skdev) -{ - int rc = 0; - - switch 
(skdev->state) { - case SKD_DRVR_STATE_BUSY: - case SKD_DRVR_STATE_BUSY_IMMINENT: - dev_dbg(&skdev->pdev->dev, "stopping queue\n"); - blk_mq_stop_hw_queues(skdev->queue); - break; - case SKD_DRVR_STATE_ONLINE: - case SKD_DRVR_STATE_STOPPING: - case SKD_DRVR_STATE_SYNCING: - case SKD_DRVR_STATE_PAUSING: - case SKD_DRVR_STATE_PAUSED: - case SKD_DRVR_STATE_STARTING: - case SKD_DRVR_STATE_RESTARTING: - case SKD_DRVR_STATE_RESUMING: - default: - rc = -EINVAL; - dev_dbg(&skdev->pdev->dev, "state [%d] not implemented\n", - skdev->state); - } - return rc; -} - -/* assume spinlock is held */ -static int skd_unquiesce_dev(struct skd_device *skdev) -{ - int prev_driver_state = skdev->state; - - skd_log_skdev(skdev, "unquiesce"); - if (skdev->state == SKD_DRVR_STATE_ONLINE) { - dev_dbg(&skdev->pdev->dev, "**** device already ONLINE\n"); - return 0; - } - if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) { - /* - * If there has been an state change to other than - * ONLINE, we will rely on controller state change - * to come back online and restart the queue. - * The BUSY state means that driver is ready to - * continue normal processing but waiting for controller - * to become available. - */ - skdev->state = SKD_DRVR_STATE_BUSY; - dev_dbg(&skdev->pdev->dev, "drive BUSY state\n"); - return 0; - } - - /* - * Drive has just come online, driver is either in startup, - * paused performing a task, or bust waiting for hardware. - */ - switch (skdev->state) { - case SKD_DRVR_STATE_PAUSED: - case SKD_DRVR_STATE_BUSY: - case SKD_DRVR_STATE_BUSY_IMMINENT: - case SKD_DRVR_STATE_BUSY_ERASE: - case SKD_DRVR_STATE_STARTING: - case SKD_DRVR_STATE_RESTARTING: - case SKD_DRVR_STATE_FAULT: - case SKD_DRVR_STATE_IDLE: - case SKD_DRVR_STATE_LOAD: - skdev->state = SKD_DRVR_STATE_ONLINE; - dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n", - skd_skdev_state_to_str(prev_driver_state), - prev_driver_state, skd_skdev_state_to_str(skdev->state), - skdev->state); - dev_dbg(&skdev->pdev->dev, - "**** device ONLINE...starting block queue\n"); - dev_dbg(&skdev->pdev->dev, "starting queue\n"); - dev_info(&skdev->pdev->dev, "STEC s1120 ONLINE\n"); - schedule_work(&skdev->start_queue); - skdev->gendisk_on = 1; - wake_up_interruptible(&skdev->waitq); - break; - - case SKD_DRVR_STATE_DISAPPEARED: - default: - dev_dbg(&skdev->pdev->dev, - "**** driver state %d, not implemented\n", - skdev->state); - return -EBUSY; - } - return 0; -} - -/* - ***************************************************************************** - * PCIe MSI/MSI-X INTERRUPT HANDLERS - ***************************************************************************** - */ - -static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data) -{ - struct skd_device *skdev = skd_host_data; - unsigned long flags; - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - dev_err(&skdev->pdev->dev, "MSIX reserved irq %d = 0x%x\n", irq, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST); - spin_unlock_irqrestore(&skdev->lock, flags); - return IRQ_HANDLED; -} - -static irqreturn_t skd_statec_isr(int irq, void *skd_host_data) -{ - struct skd_device *skdev = skd_host_data; - unsigned long flags; - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST); - skd_isr_fwstate(skdev); - 
spin_unlock_irqrestore(&skdev->lock, flags); - return IRQ_HANDLED; -} - -static irqreturn_t skd_comp_q(int irq, void *skd_host_data) -{ - struct skd_device *skdev = skd_host_data; - unsigned long flags; - int flush_enqueued = 0; - int deferred; - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST); - deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, - &flush_enqueued); - if (flush_enqueued) - schedule_work(&skdev->start_queue); - - if (deferred) - schedule_work(&skdev->completion_worker); - else if (!flush_enqueued) - schedule_work(&skdev->start_queue); - - spin_unlock_irqrestore(&skdev->lock, flags); - - return IRQ_HANDLED; -} - -static irqreturn_t skd_msg_isr(int irq, void *skd_host_data) -{ - struct skd_device *skdev = skd_host_data; - unsigned long flags; - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST); - skd_isr_msg_from_dev(skdev); - spin_unlock_irqrestore(&skdev->lock, flags); - return IRQ_HANDLED; -} - -static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data) -{ - struct skd_device *skdev = skd_host_data; - unsigned long flags; - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST); - spin_unlock_irqrestore(&skdev->lock, flags); - return IRQ_HANDLED; -} - -/* - ***************************************************************************** - * PCIe MSI/MSI-X SETUP - ***************************************************************************** - */ - -struct skd_msix_entry { - char isr_name[30]; -}; - -struct skd_init_msix_entry { - const char *name; - irq_handler_t handler; -}; - -#define SKD_MAX_MSIX_COUNT 13 -#define SKD_MIN_MSIX_COUNT 7 -#define SKD_BASE_MSIX_IRQ 4 - -static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = { - { "(DMA 0)", skd_reserved_isr }, - { "(DMA 1)", skd_reserved_isr }, - { "(DMA 2)", skd_reserved_isr }, - { "(DMA 3)", skd_reserved_isr }, - { "(State Change)", skd_statec_isr }, - { "(COMPL_Q)", skd_comp_q }, - { "(MSG)", skd_msg_isr }, - { "(Reserved)", skd_reserved_isr }, - { "(Reserved)", skd_reserved_isr }, - { "(Queue Full 0)", skd_qfull_isr }, - { "(Queue Full 1)", skd_qfull_isr }, - { "(Queue Full 2)", skd_qfull_isr }, - { "(Queue Full 3)", skd_qfull_isr }, -}; - -static int skd_acquire_msix(struct skd_device *skdev) -{ - int i, rc; - struct pci_dev *pdev = skdev->pdev; - - rc = pci_alloc_irq_vectors(pdev, SKD_MAX_MSIX_COUNT, SKD_MAX_MSIX_COUNT, - PCI_IRQ_MSIX); - if (rc < 0) { - dev_err(&skdev->pdev->dev, "failed to enable MSI-X %d\n", rc); - goto out; - } - - skdev->msix_entries = kcalloc(SKD_MAX_MSIX_COUNT, - sizeof(struct skd_msix_entry), GFP_KERNEL); - if (!skdev->msix_entries) { - rc = -ENOMEM; - dev_err(&skdev->pdev->dev, "msix table allocation error\n"); - goto out; - } - - /* Enable MSI-X vectors for the base queue */ - for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) { - struct skd_msix_entry *qentry = &skdev->msix_entries[i]; - - snprintf(qentry->isr_name, sizeof(qentry->isr_name), - "%s%d-msix %s", DRV_NAME, skdev->devno, - msix_entries[i].name); - - rc = devm_request_irq(&skdev->pdev->dev, - pci_irq_vector(skdev->pdev, i), - msix_entries[i].handler, 0, - qentry->isr_name, skdev); - if (rc) { 
- dev_err(&skdev->pdev->dev, - "Unable to register(%d) MSI-X handler %d: %s\n", - rc, i, qentry->isr_name); - goto msix_out; - } - } - - dev_dbg(&skdev->pdev->dev, "%d msix irq(s) enabled\n", - SKD_MAX_MSIX_COUNT); - return 0; - -msix_out: - while (--i >= 0) - devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), skdev); -out: - kfree(skdev->msix_entries); - skdev->msix_entries = NULL; - return rc; -} - -static int skd_acquire_irq(struct skd_device *skdev) -{ - struct pci_dev *pdev = skdev->pdev; - unsigned int irq_flag = PCI_IRQ_LEGACY; - int rc; - - if (skd_isr_type == SKD_IRQ_MSIX) { - rc = skd_acquire_msix(skdev); - if (!rc) - return 0; - - dev_err(&skdev->pdev->dev, - "failed to enable MSI-X, re-trying with MSI %d\n", rc); - } - - snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d", DRV_NAME, - skdev->devno); - - if (skd_isr_type != SKD_IRQ_LEGACY) - irq_flag |= PCI_IRQ_MSI; - rc = pci_alloc_irq_vectors(pdev, 1, 1, irq_flag); - if (rc < 0) { - dev_err(&skdev->pdev->dev, - "failed to allocate the MSI interrupt %d\n", rc); - return rc; - } - - rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, - pdev->msi_enabled ? 0 : IRQF_SHARED, - skdev->isr_name, skdev); - if (rc) { - pci_free_irq_vectors(pdev); - dev_err(&skdev->pdev->dev, "failed to allocate interrupt %d\n", - rc); - return rc; - } - - return 0; -} - -static void skd_release_irq(struct skd_device *skdev) -{ - struct pci_dev *pdev = skdev->pdev; - - if (skdev->msix_entries) { - int i; - - for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) { - devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), - skdev); - } - - kfree(skdev->msix_entries); - skdev->msix_entries = NULL; - } else { - devm_free_irq(&pdev->dev, pdev->irq, skdev); - } - - pci_free_irq_vectors(pdev); -} - -/* - ***************************************************************************** - * CONSTRUCT - ***************************************************************************** - */ - -static void *skd_alloc_dma(struct skd_device *skdev, struct kmem_cache *s, - dma_addr_t *dma_handle, gfp_t gfp, - enum dma_data_direction dir) -{ - struct device *dev = &skdev->pdev->dev; - void *buf; - - buf = kmem_cache_alloc(s, gfp); - if (!buf) - return NULL; - *dma_handle = dma_map_single(dev, buf, - kmem_cache_size(s), dir); - if (dma_mapping_error(dev, *dma_handle)) { - kmem_cache_free(s, buf); - buf = NULL; - } - return buf; -} - -static void skd_free_dma(struct skd_device *skdev, struct kmem_cache *s, - void *vaddr, dma_addr_t dma_handle, - enum dma_data_direction dir) -{ - if (!vaddr) - return; - - dma_unmap_single(&skdev->pdev->dev, dma_handle, - kmem_cache_size(s), dir); - kmem_cache_free(s, vaddr); -} - -static int skd_cons_skcomp(struct skd_device *skdev) -{ - int rc = 0; - struct fit_completion_entry_v1 *skcomp; - - dev_dbg(&skdev->pdev->dev, - "comp pci_alloc, total bytes %zd entries %d\n", - SKD_SKCOMP_SIZE, SKD_N_COMPLETION_ENTRY); - - skcomp = dma_alloc_coherent(&skdev->pdev->dev, SKD_SKCOMP_SIZE, - &skdev->cq_dma_address, GFP_KERNEL); - - if (skcomp == NULL) { - rc = -ENOMEM; - goto err_out; - } - - skdev->skcomp_table = skcomp; - skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp + - sizeof(*skcomp) * - SKD_N_COMPLETION_ENTRY); - -err_out: - return rc; -} - -static int skd_cons_skmsg(struct skd_device *skdev) -{ - int rc = 0; - u32 i; - - dev_dbg(&skdev->pdev->dev, - "skmsg_table kcalloc, struct %lu, count %u total %lu\n", - sizeof(struct skd_fitmsg_context), skdev->num_fitmsg_context, - sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); - - 
skdev->skmsg_table = kcalloc(skdev->num_fitmsg_context, - sizeof(struct skd_fitmsg_context), - GFP_KERNEL); - if (skdev->skmsg_table == NULL) { - rc = -ENOMEM; - goto err_out; - } - - for (i = 0; i < skdev->num_fitmsg_context; i++) { - struct skd_fitmsg_context *skmsg; - - skmsg = &skdev->skmsg_table[i]; - - skmsg->id = i + SKD_ID_FIT_MSG; - - skmsg->msg_buf = dma_alloc_coherent(&skdev->pdev->dev, - SKD_N_FITMSG_BYTES, - &skmsg->mb_dma_address, - GFP_KERNEL); - if (skmsg->msg_buf == NULL) { - rc = -ENOMEM; - goto err_out; - } - - WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & - (FIT_QCMD_ALIGN - 1), - "not aligned: msg_buf %p mb_dma_address %pad\n", - skmsg->msg_buf, &skmsg->mb_dma_address); - } - -err_out: - return rc; -} - -static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, - u32 n_sg, - dma_addr_t *ret_dma_addr) -{ - struct fit_sg_descriptor *sg_list; - - sg_list = skd_alloc_dma(skdev, skdev->sglist_cache, ret_dma_addr, - GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE); - - if (sg_list != NULL) { - uint64_t dma_address = *ret_dma_addr; - u32 i; - - for (i = 0; i < n_sg - 1; i++) { - uint64_t ndp_off; - ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor); - - sg_list[i].next_desc_ptr = dma_address + ndp_off; - } - sg_list[i].next_desc_ptr = 0LL; - } - - return sg_list; -} - -static void skd_free_sg_list(struct skd_device *skdev, - struct fit_sg_descriptor *sg_list, - dma_addr_t dma_addr) -{ - if (WARN_ON_ONCE(!sg_list)) - return; - - skd_free_dma(skdev, skdev->sglist_cache, sg_list, dma_addr, - DMA_TO_DEVICE); -} - -static int skd_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct skd_device *skdev = set->driver_data; - struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); - - skreq->state = SKD_REQ_STATE_IDLE; - skreq->sg = (void *)(skreq + 1); - sg_init_table(skreq->sg, skd_sgs_per_request); - skreq->sksg_list = skd_cons_sg_list(skdev, skd_sgs_per_request, - &skreq->sksg_dma_address); - - return skreq->sksg_list ? 
0 : -ENOMEM; -} - -static void skd_exit_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx) -{ - struct skd_device *skdev = set->driver_data; - struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); - - skd_free_sg_list(skdev, skreq->sksg_list, skreq->sksg_dma_address); -} - -static int skd_cons_sksb(struct skd_device *skdev) -{ - int rc = 0; - struct skd_special_context *skspcl; - - skspcl = &skdev->internal_skspcl; - - skspcl->req.id = 0 + SKD_ID_INTERNAL; - skspcl->req.state = SKD_REQ_STATE_IDLE; - - skspcl->data_buf = skd_alloc_dma(skdev, skdev->databuf_cache, - &skspcl->db_dma_address, - GFP_DMA | __GFP_ZERO, - DMA_BIDIRECTIONAL); - if (skspcl->data_buf == NULL) { - rc = -ENOMEM; - goto err_out; - } - - skspcl->msg_buf = skd_alloc_dma(skdev, skdev->msgbuf_cache, - &skspcl->mb_dma_address, - GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE); - if (skspcl->msg_buf == NULL) { - rc = -ENOMEM; - goto err_out; - } - - skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1, - &skspcl->req.sksg_dma_address); - if (skspcl->req.sksg_list == NULL) { - rc = -ENOMEM; - goto err_out; - } - - if (!skd_format_internal_skspcl(skdev)) { - rc = -EINVAL; - goto err_out; - } - -err_out: - return rc; -} - -static const struct blk_mq_ops skd_mq_ops = { - .queue_rq = skd_mq_queue_rq, - .complete = skd_complete_rq, - .timeout = skd_timed_out, - .init_request = skd_init_request, - .exit_request = skd_exit_request, -}; - -static int skd_cons_disk(struct skd_device *skdev) -{ - int rc = 0; - struct gendisk *disk; - struct request_queue *q; - unsigned long flags; - - disk = alloc_disk(SKD_MINORS_PER_DEVICE); - if (!disk) { - rc = -ENOMEM; - goto err_out; - } - - skdev->disk = disk; - sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno); - - disk->major = skdev->major; - disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE; - disk->fops = &skd_blockdev_ops; - disk->private_data = skdev; - - memset(&skdev->tag_set, 0, sizeof(skdev->tag_set)); - skdev->tag_set.ops = &skd_mq_ops; - skdev->tag_set.nr_hw_queues = 1; - skdev->tag_set.queue_depth = skd_max_queue_depth; - skdev->tag_set.cmd_size = sizeof(struct skd_request_context) + - skdev->sgs_per_request * sizeof(struct scatterlist); - skdev->tag_set.numa_node = NUMA_NO_NODE; - skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO); - skdev->tag_set.driver_data = skdev; - rc = blk_mq_alloc_tag_set(&skdev->tag_set); - if (rc) - goto err_out; - q = blk_mq_init_queue(&skdev->tag_set); - if (IS_ERR(q)) { - blk_mq_free_tag_set(&skdev->tag_set); - rc = PTR_ERR(q); - goto err_out; - } - q->queuedata = skdev; - - skdev->queue = q; - disk->queue = q; - - blk_queue_write_cache(q, true, true); - blk_queue_max_segments(q, skdev->sgs_per_request); - blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); - - /* set optimal I/O size to 8KB */ - blk_queue_io_opt(q, 8192); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - - blk_queue_rq_timeout(q, 8 * HZ); - - spin_lock_irqsave(&skdev->lock, flags); - dev_dbg(&skdev->pdev->dev, "stopping queue\n"); - blk_mq_stop_hw_queues(skdev->queue); - spin_unlock_irqrestore(&skdev->lock, flags); - -err_out: - return rc; -} - -#define SKD_N_DEV_TABLE 16u -static u32 skd_next_devno; - -static struct skd_device *skd_construct(struct pci_dev *pdev) -{ - struct skd_device *skdev; - int blk_major = skd_major; - size_t size; - int rc; - - skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); - - if (!skdev) { - dev_err(&pdev->dev, "memory alloc failure\n"); - 
return NULL; - } - - skdev->state = SKD_DRVR_STATE_LOAD; - skdev->pdev = pdev; - skdev->devno = skd_next_devno++; - skdev->major = blk_major; - skdev->dev_max_queue_depth = 0; - - skdev->num_req_context = skd_max_queue_depth; - skdev->num_fitmsg_context = skd_max_queue_depth; - skdev->cur_max_queue_depth = 1; - skdev->queue_low_water_mark = 1; - skdev->proto_ver = 99; - skdev->sgs_per_request = skd_sgs_per_request; - skdev->dbg_level = skd_dbg_level; - - spin_lock_init(&skdev->lock); - - INIT_WORK(&skdev->start_queue, skd_start_queue); - INIT_WORK(&skdev->completion_worker, skd_completion_worker); - - size = max(SKD_N_FITMSG_BYTES, SKD_N_SPECIAL_FITMSG_BYTES); - skdev->msgbuf_cache = kmem_cache_create("skd-msgbuf", size, 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!skdev->msgbuf_cache) - goto err_out; - WARN_ONCE(kmem_cache_size(skdev->msgbuf_cache) < size, - "skd-msgbuf: %d < %zd\n", - kmem_cache_size(skdev->msgbuf_cache), size); - size = skd_sgs_per_request * sizeof(struct fit_sg_descriptor); - skdev->sglist_cache = kmem_cache_create("skd-sglist", size, 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!skdev->sglist_cache) - goto err_out; - WARN_ONCE(kmem_cache_size(skdev->sglist_cache) < size, - "skd-sglist: %d < %zd\n", - kmem_cache_size(skdev->sglist_cache), size); - size = SKD_N_INTERNAL_BYTES; - skdev->databuf_cache = kmem_cache_create("skd-databuf", size, 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!skdev->databuf_cache) - goto err_out; - WARN_ONCE(kmem_cache_size(skdev->databuf_cache) < size, - "skd-databuf: %d < %zd\n", - kmem_cache_size(skdev->databuf_cache), size); - - dev_dbg(&skdev->pdev->dev, "skcomp\n"); - rc = skd_cons_skcomp(skdev); - if (rc < 0) - goto err_out; - - dev_dbg(&skdev->pdev->dev, "skmsg\n"); - rc = skd_cons_skmsg(skdev); - if (rc < 0) - goto err_out; - - dev_dbg(&skdev->pdev->dev, "sksb\n"); - rc = skd_cons_sksb(skdev); - if (rc < 0) - goto err_out; - - dev_dbg(&skdev->pdev->dev, "disk\n"); - rc = skd_cons_disk(skdev); - if (rc < 0) - goto err_out; - - dev_dbg(&skdev->pdev->dev, "VICTORY\n"); - return skdev; - -err_out: - dev_dbg(&skdev->pdev->dev, "construct failed\n"); - skd_destruct(skdev); - return NULL; -} - -/* - ***************************************************************************** - * DESTRUCT (FREE) - ***************************************************************************** - */ - -static void skd_free_skcomp(struct skd_device *skdev) -{ - if (skdev->skcomp_table) - dma_free_coherent(&skdev->pdev->dev, SKD_SKCOMP_SIZE, - skdev->skcomp_table, skdev->cq_dma_address); - - skdev->skcomp_table = NULL; - skdev->cq_dma_address = 0; -} - -static void skd_free_skmsg(struct skd_device *skdev) -{ - u32 i; - - if (skdev->skmsg_table == NULL) - return; - - for (i = 0; i < skdev->num_fitmsg_context; i++) { - struct skd_fitmsg_context *skmsg; - - skmsg = &skdev->skmsg_table[i]; - - if (skmsg->msg_buf != NULL) { - dma_free_coherent(&skdev->pdev->dev, SKD_N_FITMSG_BYTES, - skmsg->msg_buf, - skmsg->mb_dma_address); - } - skmsg->msg_buf = NULL; - skmsg->mb_dma_address = 0; - } - - kfree(skdev->skmsg_table); - skdev->skmsg_table = NULL; -} - -static void skd_free_sksb(struct skd_device *skdev) -{ - struct skd_special_context *skspcl = &skdev->internal_skspcl; - - skd_free_dma(skdev, skdev->databuf_cache, skspcl->data_buf, - skspcl->db_dma_address, DMA_BIDIRECTIONAL); - - skspcl->data_buf = NULL; - skspcl->db_dma_address = 0; - - skd_free_dma(skdev, skdev->msgbuf_cache, skspcl->msg_buf, - skspcl->mb_dma_address, DMA_TO_DEVICE); - - skspcl->msg_buf = NULL; - skspcl->mb_dma_address = 
0; - - skd_free_sg_list(skdev, skspcl->req.sksg_list, - skspcl->req.sksg_dma_address); - - skspcl->req.sksg_list = NULL; - skspcl->req.sksg_dma_address = 0; -} - -static void skd_free_disk(struct skd_device *skdev) -{ - struct gendisk *disk = skdev->disk; - - if (disk && (disk->flags & GENHD_FL_UP)) - del_gendisk(disk); - - if (skdev->queue) { - blk_cleanup_queue(skdev->queue); - skdev->queue = NULL; - if (disk) - disk->queue = NULL; - } - - if (skdev->tag_set.tags) - blk_mq_free_tag_set(&skdev->tag_set); - - put_disk(disk); - skdev->disk = NULL; -} - -static void skd_destruct(struct skd_device *skdev) -{ - if (skdev == NULL) - return; - - cancel_work_sync(&skdev->start_queue); - - dev_dbg(&skdev->pdev->dev, "disk\n"); - skd_free_disk(skdev); - - dev_dbg(&skdev->pdev->dev, "sksb\n"); - skd_free_sksb(skdev); - - dev_dbg(&skdev->pdev->dev, "skmsg\n"); - skd_free_skmsg(skdev); - - dev_dbg(&skdev->pdev->dev, "skcomp\n"); - skd_free_skcomp(skdev); - - kmem_cache_destroy(skdev->databuf_cache); - kmem_cache_destroy(skdev->sglist_cache); - kmem_cache_destroy(skdev->msgbuf_cache); - - dev_dbg(&skdev->pdev->dev, "skdev\n"); - kfree(skdev); -} - -/* - ***************************************************************************** - * BLOCK DEVICE (BDEV) GLUE - ***************************************************************************** - */ - -static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct skd_device *skdev; - u64 capacity; - - skdev = bdev->bd_disk->private_data; - - dev_dbg(&skdev->pdev->dev, "%s: CMD[%s] getgeo device\n", - bdev->bd_disk->disk_name, current->comm); - - if (skdev->read_cap_is_valid) { - capacity = get_capacity(skdev->disk); - geo->heads = 64; - geo->sectors = 255; - geo->cylinders = (capacity) / (255 * 64); - - return 0; - } - return -EIO; -} - -static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) -{ - dev_dbg(&skdev->pdev->dev, "add_disk\n"); - device_add_disk(parent, skdev->disk, NULL); - return 0; -} - -static const struct block_device_operations skd_blockdev_ops = { - .owner = THIS_MODULE, - .getgeo = skd_bdev_getgeo, -}; - -/* - ***************************************************************************** - * PCIe DRIVER GLUE - ***************************************************************************** - */ - -static const struct pci_device_id skd_pci_tbl[] = { - { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, - { 0 } /* terminate list */ -}; - -MODULE_DEVICE_TABLE(pci, skd_pci_tbl); - -static char *skd_pci_info(struct skd_device *skdev, char *str) -{ - int pcie_reg; - - strcpy(str, "PCIe ("); - pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP); - - if (pcie_reg) { - - char lwstr[6]; - uint16_t pcie_lstat, lspeed, lwidth; - - pcie_reg += 0x12; - pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat); - lspeed = pcie_lstat & (0xF); - lwidth = (pcie_lstat & 0x3F0) >> 4; - - if (lspeed == 1) - strcat(str, "2.5GT/s "); - else if (lspeed == 2) - strcat(str, "5.0GT/s "); - else - strcat(str, " "); - snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth); - strcat(str, lwstr); - } - return str; -} - -static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - int i; - int rc = 0; - char pci_str[32]; - struct skd_device *skdev; - - dev_dbg(&pdev->dev, "vendor=%04X device=%04x\n", pdev->vendor, - pdev->device); - - rc = pci_enable_device(pdev); - if (rc) - return rc; - rc = pci_request_regions(pdev, DRV_NAME); - if (rc) - goto err_out; - rc = 
dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (rc) { - dev_err(&pdev->dev, "DMA mask error %d\n", rc); - goto err_out_regions; - } - - if (!skd_major) { - rc = register_blkdev(0, DRV_NAME); - if (rc < 0) - goto err_out_regions; - BUG_ON(!rc); - skd_major = rc; - } - - skdev = skd_construct(pdev); - if (skdev == NULL) { - rc = -ENOMEM; - goto err_out_regions; - } - - skd_pci_info(skdev, pci_str); - dev_info(&pdev->dev, "%s 64bit\n", pci_str); - - pci_set_master(pdev); - rc = pci_enable_pcie_error_reporting(pdev); - if (rc) { - dev_err(&pdev->dev, - "bad enable of PCIe error reporting rc=%d\n", rc); - skdev->pcie_error_reporting_is_enabled = 0; - } else - skdev->pcie_error_reporting_is_enabled = 1; - - pci_set_drvdata(pdev, skdev); - - for (i = 0; i < SKD_MAX_BARS; i++) { - skdev->mem_phys[i] = pci_resource_start(pdev, i); - skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); - skdev->mem_map[i] = ioremap(skdev->mem_phys[i], - skdev->mem_size[i]); - if (!skdev->mem_map[i]) { - dev_err(&pdev->dev, - "Unable to map adapter memory!\n"); - rc = -ENODEV; - goto err_out_iounmap; - } - dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n", - skdev->mem_map[i], (uint64_t)skdev->mem_phys[i], - skdev->mem_size[i]); - } - - rc = skd_acquire_irq(skdev); - if (rc) { - dev_err(&pdev->dev, "interrupt resource error %d\n", rc); - goto err_out_iounmap; - } - - rc = skd_start_timer(skdev); - if (rc) - goto err_out_timer; - - init_waitqueue_head(&skdev->waitq); - - skd_start_device(skdev); - - rc = wait_event_interruptible_timeout(skdev->waitq, - (skdev->gendisk_on), - (SKD_START_WAIT_SECONDS * HZ)); - if (skdev->gendisk_on > 0) { - /* device came on-line after reset */ - skd_bdev_attach(&pdev->dev, skdev); - rc = 0; - } else { - /* we timed out, something is wrong with the device, - don't add the disk structure */ - dev_err(&pdev->dev, "error: waiting for s1120 timed out %d!\n", - rc); - /* in case of no error; we timeout with ENXIO */ - if (!rc) - rc = -ENXIO; - goto err_out_timer; - } - - return rc; - -err_out_timer: - skd_stop_device(skdev); - skd_release_irq(skdev); - -err_out_iounmap: - for (i = 0; i < SKD_MAX_BARS; i++) - if (skdev->mem_map[i]) - iounmap(skdev->mem_map[i]); - - if (skdev->pcie_error_reporting_is_enabled) - pci_disable_pcie_error_reporting(pdev); - - skd_destruct(skdev); - -err_out_regions: - pci_release_regions(pdev); - -err_out: - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); - return rc; -} - -static void skd_pci_remove(struct pci_dev *pdev) -{ - int i; - struct skd_device *skdev; - - skdev = pci_get_drvdata(pdev); - if (!skdev) { - dev_err(&pdev->dev, "no device data for PCI\n"); - return; - } - skd_stop_device(skdev); - skd_release_irq(skdev); - - for (i = 0; i < SKD_MAX_BARS; i++) - if (skdev->mem_map[i]) - iounmap(skdev->mem_map[i]); - - if (skdev->pcie_error_reporting_is_enabled) - pci_disable_pcie_error_reporting(pdev); - - skd_destruct(skdev); - - pci_release_regions(pdev); - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); - - return; -} - -static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) -{ - int i; - struct skd_device *skdev; - - skdev = pci_get_drvdata(pdev); - if (!skdev) { - dev_err(&pdev->dev, "no device data for PCI\n"); - return -EIO; - } - - skd_stop_device(skdev); - - skd_release_irq(skdev); - - for (i = 0; i < SKD_MAX_BARS; i++) - if (skdev->mem_map[i]) - iounmap(skdev->mem_map[i]); - - if (skdev->pcie_error_reporting_is_enabled) - 
pci_disable_pcie_error_reporting(pdev); - - pci_release_regions(pdev); - pci_save_state(pdev); - pci_disable_device(pdev); - pci_set_power_state(pdev, pci_choose_state(pdev, state)); - return 0; -} - -static int skd_pci_resume(struct pci_dev *pdev) -{ - int i; - int rc = 0; - struct skd_device *skdev; - - skdev = pci_get_drvdata(pdev); - if (!skdev) { - dev_err(&pdev->dev, "no device data for PCI\n"); - return -1; - } - - pci_set_power_state(pdev, PCI_D0); - pci_enable_wake(pdev, PCI_D0, 0); - pci_restore_state(pdev); - - rc = pci_enable_device(pdev); - if (rc) - return rc; - rc = pci_request_regions(pdev, DRV_NAME); - if (rc) - goto err_out; - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (rc) { - dev_err(&pdev->dev, "DMA mask error %d\n", rc); - goto err_out_regions; - } - - pci_set_master(pdev); - rc = pci_enable_pcie_error_reporting(pdev); - if (rc) { - dev_err(&pdev->dev, - "bad enable of PCIe error reporting rc=%d\n", rc); - skdev->pcie_error_reporting_is_enabled = 0; - } else - skdev->pcie_error_reporting_is_enabled = 1; - - for (i = 0; i < SKD_MAX_BARS; i++) { - - skdev->mem_phys[i] = pci_resource_start(pdev, i); - skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); - skdev->mem_map[i] = ioremap(skdev->mem_phys[i], - skdev->mem_size[i]); - if (!skdev->mem_map[i]) { - dev_err(&pdev->dev, "Unable to map adapter memory!\n"); - rc = -ENODEV; - goto err_out_iounmap; - } - dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n", - skdev->mem_map[i], (uint64_t)skdev->mem_phys[i], - skdev->mem_size[i]); - } - rc = skd_acquire_irq(skdev); - if (rc) { - dev_err(&pdev->dev, "interrupt resource error %d\n", rc); - goto err_out_iounmap; - } - - rc = skd_start_timer(skdev); - if (rc) - goto err_out_timer; - - init_waitqueue_head(&skdev->waitq); - - skd_start_device(skdev); - - return rc; - -err_out_timer: - skd_stop_device(skdev); - skd_release_irq(skdev); - -err_out_iounmap: - for (i = 0; i < SKD_MAX_BARS; i++) - if (skdev->mem_map[i]) - iounmap(skdev->mem_map[i]); - - if (skdev->pcie_error_reporting_is_enabled) - pci_disable_pcie_error_reporting(pdev); - -err_out_regions: - pci_release_regions(pdev); - -err_out: - pci_disable_device(pdev); - return rc; -} - -static void skd_pci_shutdown(struct pci_dev *pdev) -{ - struct skd_device *skdev; - - dev_err(&pdev->dev, "%s called\n", __func__); - - skdev = pci_get_drvdata(pdev); - if (!skdev) { - dev_err(&pdev->dev, "no device data for PCI\n"); - return; - } - - dev_err(&pdev->dev, "calling stop\n"); - skd_stop_device(skdev); -} - -static struct pci_driver skd_driver = { - .name = DRV_NAME, - .id_table = skd_pci_tbl, - .probe = skd_pci_probe, - .remove = skd_pci_remove, - .suspend = skd_pci_suspend, - .resume = skd_pci_resume, - .shutdown = skd_pci_shutdown, -}; - -/* - ***************************************************************************** - * LOGGING SUPPORT - ***************************************************************************** - */ - -const char *skd_drive_state_to_str(int state) -{ - switch (state) { - case FIT_SR_DRIVE_OFFLINE: - return "OFFLINE"; - case FIT_SR_DRIVE_INIT: - return "INIT"; - case FIT_SR_DRIVE_ONLINE: - return "ONLINE"; - case FIT_SR_DRIVE_BUSY: - return "BUSY"; - case FIT_SR_DRIVE_FAULT: - return "FAULT"; - case FIT_SR_DRIVE_DEGRADED: - return "DEGRADED"; - case FIT_SR_PCIE_LINK_DOWN: - return "INK_DOWN"; - case FIT_SR_DRIVE_SOFT_RESET: - return "SOFT_RESET"; - case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: - return "NEED_FW"; - 
case FIT_SR_DRIVE_INIT_FAULT: - return "INIT_FAULT"; - case FIT_SR_DRIVE_BUSY_SANITIZE: - return "BUSY_SANITIZE"; - case FIT_SR_DRIVE_BUSY_ERASE: - return "BUSY_ERASE"; - case FIT_SR_DRIVE_FW_BOOTING: - return "FW_BOOTING"; - default: - return "???"; - } -} - -const char *skd_skdev_state_to_str(enum skd_drvr_state state) -{ - switch (state) { - case SKD_DRVR_STATE_LOAD: - return "LOAD"; - case SKD_DRVR_STATE_IDLE: - return "IDLE"; - case SKD_DRVR_STATE_BUSY: - return "BUSY"; - case SKD_DRVR_STATE_STARTING: - return "STARTING"; - case SKD_DRVR_STATE_ONLINE: - return "ONLINE"; - case SKD_DRVR_STATE_PAUSING: - return "PAUSING"; - case SKD_DRVR_STATE_PAUSED: - return "PAUSED"; - case SKD_DRVR_STATE_RESTARTING: - return "RESTARTING"; - case SKD_DRVR_STATE_RESUMING: - return "RESUMING"; - case SKD_DRVR_STATE_STOPPING: - return "STOPPING"; - case SKD_DRVR_STATE_SYNCING: - return "SYNCING"; - case SKD_DRVR_STATE_FAULT: - return "FAULT"; - case SKD_DRVR_STATE_DISAPPEARED: - return "DISAPPEARED"; - case SKD_DRVR_STATE_BUSY_ERASE: - return "BUSY_ERASE"; - case SKD_DRVR_STATE_BUSY_SANITIZE: - return "BUSY_SANITIZE"; - case SKD_DRVR_STATE_BUSY_IMMINENT: - return "BUSY_IMMINENT"; - case SKD_DRVR_STATE_WAIT_BOOT: - return "WAIT_BOOT"; - - default: - return "???"; - } -} - -static const char *skd_skreq_state_to_str(enum skd_req_state state) -{ - switch (state) { - case SKD_REQ_STATE_IDLE: - return "IDLE"; - case SKD_REQ_STATE_SETUP: - return "SETUP"; - case SKD_REQ_STATE_BUSY: - return "BUSY"; - case SKD_REQ_STATE_COMPLETED: - return "COMPLETED"; - case SKD_REQ_STATE_TIMEOUT: - return "TIMEOUT"; - default: - return "???"; - } -} - -static void skd_log_skdev(struct skd_device *skdev, const char *event) -{ - dev_dbg(&skdev->pdev->dev, "skdev=%p event='%s'\n", skdev, event); - dev_dbg(&skdev->pdev->dev, " drive_state=%s(%d) driver_state=%s(%d)\n", - skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, - skd_skdev_state_to_str(skdev->state), skdev->state); - dev_dbg(&skdev->pdev->dev, " busy=%d limit=%d dev=%d lowat=%d\n", - skd_in_flight(skdev), skdev->cur_max_queue_depth, - skdev->dev_max_queue_depth, skdev->queue_low_water_mark); - dev_dbg(&skdev->pdev->dev, " cycle=%d cycle_ix=%d\n", - skdev->skcomp_cycle, skdev->skcomp_ix); -} - -static void skd_log_skreq(struct skd_device *skdev, - struct skd_request_context *skreq, const char *event) -{ - struct request *req = blk_mq_rq_from_pdu(skreq); - u32 lba = blk_rq_pos(req); - u32 count = blk_rq_sectors(req); - - dev_dbg(&skdev->pdev->dev, "skreq=%p event='%s'\n", skreq, event); - dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x fitmsg=0x%04x\n", - skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id, - skreq->fitmsg_id); - dev_dbg(&skdev->pdev->dev, " sg_dir=%d n_sg=%d\n", - skreq->data_dir, skreq->n_sg); - - dev_dbg(&skdev->pdev->dev, - "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, lba, - count, count, (int)rq_data_dir(req)); -} - -/* - ***************************************************************************** - * MODULE GLUE - ***************************************************************************** - */ - -static int __init skd_init(void) -{ - BUILD_BUG_ON(sizeof(struct fit_completion_entry_v1) != 8); - BUILD_BUG_ON(sizeof(struct fit_comp_error_info) != 32); - BUILD_BUG_ON(sizeof(struct skd_command_header) != 16); - BUILD_BUG_ON(sizeof(struct skd_scsi_request) != 32); - BUILD_BUG_ON(sizeof(struct driver_inquiry_data) != 44); - BUILD_BUG_ON(offsetof(struct skd_msg_buf, fmh) != 0); - BUILD_BUG_ON(offsetof(struct 
skd_msg_buf, scsi) != 64); - BUILD_BUG_ON(sizeof(struct skd_msg_buf) != SKD_N_FITMSG_BYTES); - - switch (skd_isr_type) { - case SKD_IRQ_LEGACY: - case SKD_IRQ_MSI: - case SKD_IRQ_MSIX: - break; - default: - pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n", - skd_isr_type, SKD_IRQ_DEFAULT); - skd_isr_type = SKD_IRQ_DEFAULT; - } - - if (skd_max_queue_depth < 1 || - skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) { - pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n", - skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT); - skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; - } - - if (skd_max_req_per_msg < 1 || - skd_max_req_per_msg > SKD_MAX_REQ_PER_MSG) { - pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n", - skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT); - skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; - } - - if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) { - pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n", - skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT); - skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; - } - - if (skd_dbg_level < 0 || skd_dbg_level > 2) { - pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n", - skd_dbg_level, 0); - skd_dbg_level = 0; - } - - if (skd_isr_comp_limit < 0) { - pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n", - skd_isr_comp_limit, 0); - skd_isr_comp_limit = 0; - } - - return pci_register_driver(&skd_driver); -} - -static void __exit skd_exit(void) -{ - pci_unregister_driver(&skd_driver); - - if (skd_major) - unregister_blkdev(skd_major, DRV_NAME); -} - -module_init(skd_init); -module_exit(skd_exit); diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h deleted file mode 100644 index c30bb98c7cd2..000000000000 --- a/drivers/block/skd_s1120.h +++ /dev/null @@ -1,322 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012 STEC, Inc. - * Copyright (c) 2017 Western Digital Corporation or its affiliates. 
- */ - - -#ifndef SKD_S1120_H -#define SKD_S1120_H - -/* - * Q-channel, 64-bit r/w - */ -#define FIT_Q_COMMAND 0x400u -#define FIT_QCMD_QID_MASK (0x3 << 1) -#define FIT_QCMD_QID0 (0x0 << 1) -#define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0 -#define FIT_QCMD_QID1 (0x1 << 1) -#define FIT_QCMD_QID2 (0x2 << 1) -#define FIT_QCMD_QID3 (0x3 << 1) -#define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */ -#define FIT_QCMD_MSGSIZE_MASK (0x3 << 4) -#define FIT_QCMD_MSGSIZE_64 (0x0 << 4) -#define FIT_QCMD_MSGSIZE_128 (0x1 << 4) -#define FIT_QCMD_MSGSIZE_256 (0x2 << 4) -#define FIT_QCMD_MSGSIZE_512 (0x3 << 4) -#define FIT_QCMD_ALIGN L1_CACHE_BYTES - -/* - * Control, 32-bit r/w - */ -#define FIT_CONTROL 0x500u -#define FIT_CR_HARD_RESET (1u << 0u) -#define FIT_CR_SOFT_RESET (1u << 1u) -#define FIT_CR_DIS_TIMESTAMPS (1u << 6u) -#define FIT_CR_ENABLE_INTERRUPTS (1u << 7u) - -/* - * Status, 32-bit, r/o - */ -#define FIT_STATUS 0x510u -#define FIT_SR_DRIVE_STATE_MASK 0x000000FFu -#define FIT_SR_SIGNATURE (0xFF << 8) -#define FIT_SR_PIO_DMA (1 << 16) -#define FIT_SR_DRIVE_OFFLINE 0x00 -#define FIT_SR_DRIVE_INIT 0x01 -/* #define FIT_SR_DRIVE_READY 0x02 */ -#define FIT_SR_DRIVE_ONLINE 0x03 -#define FIT_SR_DRIVE_BUSY 0x04 -#define FIT_SR_DRIVE_FAULT 0x05 -#define FIT_SR_DRIVE_DEGRADED 0x06 -#define FIT_SR_PCIE_LINK_DOWN 0x07 -#define FIT_SR_DRIVE_SOFT_RESET 0x08 -#define FIT_SR_DRIVE_INIT_FAULT 0x09 -#define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A -#define FIT_SR_DRIVE_BUSY_ERASE 0x0B -#define FIT_SR_DRIVE_FW_BOOTING 0x0C -#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE -#define FIT_SR_DEVICE_MISSING 0xFF -#define FIT_SR__RESERVED 0xFFFFFF00u - -/* - * FIT_STATUS - Status register data definition - */ -#define FIT_SR_STATE_MASK (0xFF << 0) -#define FIT_SR_SIGNATURE (0xFF << 8) -#define FIT_SR_PIO_DMA (1 << 16) - -/* - * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear) - */ -#define FIT_INT_STATUS_HOST 0x520u -#define FIT_ISH_FW_STATE_CHANGE (1u << 0u) -#define FIT_ISH_COMPLETION_POSTED (1u << 1u) -#define FIT_ISH_MSG_FROM_DEV (1u << 2u) -#define FIT_ISH_UNDEFINED_3 (1u << 3u) -#define FIT_ISH_UNDEFINED_4 (1u << 4u) -#define FIT_ISH_Q0_FULL (1u << 5u) -#define FIT_ISH_Q1_FULL (1u << 6u) -#define FIT_ISH_Q2_FULL (1u << 7u) -#define FIT_ISH_Q3_FULL (1u << 8u) -#define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u) -#define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u) - -#define FIT_INT_DEF_MASK \ - (FIT_ISH_FW_STATE_CHANGE | \ - FIT_ISH_COMPLETION_POSTED | \ - FIT_ISH_MSG_FROM_DEV | \ - FIT_ISH_Q0_FULL | \ - FIT_ISH_Q1_FULL | \ - FIT_ISH_Q2_FULL | \ - FIT_ISH_Q3_FULL | \ - FIT_ISH_QCMD_FIFO_OVERRUN | \ - FIT_ISH_BAD_EXP_ROM_READ) - -#define FIT_INT_QUEUE_FULL \ - (FIT_ISH_Q0_FULL | \ - FIT_ISH_Q1_FULL | \ - FIT_ISH_Q2_FULL | \ - FIT_ISH_Q3_FULL) - -#define MSI_MSG_NWL_ERROR_0 0x00000000 -#define MSI_MSG_NWL_ERROR_1 0x00000001 -#define MSI_MSG_NWL_ERROR_2 0x00000002 -#define MSI_MSG_NWL_ERROR_3 0x00000003 -#define MSI_MSG_STATE_CHANGE 0x00000004 -#define MSI_MSG_COMPLETION_POSTED 0x00000005 -#define MSI_MSG_MSG_FROM_DEV 0x00000006 -#define MSI_MSG_RESERVED_0 0x00000007 -#define MSI_MSG_RESERVED_1 0x00000008 -#define MSI_MSG_QUEUE_0_FULL 0x00000009 -#define MSI_MSG_QUEUE_1_FULL 0x0000000A -#define MSI_MSG_QUEUE_2_FULL 0x0000000B -#define MSI_MSG_QUEUE_3_FULL 0x0000000C - -#define FIT_INT_RESERVED_MASK \ - (FIT_ISH_UNDEFINED_3 | \ - FIT_ISH_UNDEFINED_4) - -/* - * Interrupt mask, 32-bit r/w - * Bit definitions are the same as FIT_INT_STATUS_HOST - */ -#define FIT_INT_MASK_HOST 0x528u - -/* - * Message to device, 32-bit r/w - */ -#define 
FIT_MSG_TO_DEVICE 0x540u - -/* - * Message from device, 32-bit, r/o - */ -#define FIT_MSG_FROM_DEVICE 0x548u - -/* - * 32-bit messages to/from device, composition/extraction macros - */ -#define FIT_MXD_CONS(TYPE, PARAM, DATA) \ - ((((TYPE) & 0xFFu) << 24u) | \ - (((PARAM) & 0xFFu) << 16u) | \ - (((DATA) & 0xFFFFu) << 0u)) -#define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu) -#define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu) -#define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu) - -/* - * Types of messages to/from device - */ -#define FIT_MTD_FITFW_INIT 0x01u -#define FIT_MTD_GET_CMDQ_DEPTH 0x02u -#define FIT_MTD_SET_COMPQ_DEPTH 0x03u -#define FIT_MTD_SET_COMPQ_ADDR 0x04u -#define FIT_MTD_ARM_QUEUE 0x05u -#define FIT_MTD_CMD_LOG_HOST_ID 0x07u -#define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u -#define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u -#define FIT_MFD_SMART_EXCEEDED 0x10u -#define FIT_MFD_POWER_DOWN 0x11u -#define FIT_MFD_OFFLINE 0x12u -#define FIT_MFD_ONLINE 0x13u -#define FIT_MFD_FW_RESTARTING 0x14u -#define FIT_MFD_PM_ACTIVE 0x15u -#define FIT_MFD_PM_STANDBY 0x16u -#define FIT_MFD_PM_SLEEP 0x17u -#define FIT_MFD_CMD_PROGRESS 0x18u - -#define FIT_MTD_DEBUG 0xFEu -#define FIT_MFD_DEBUG 0xFFu - -#define FIT_MFD_MASK (0xFFu) -#define FIT_MFD_DATA_MASK (0xFFu) -#define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK) -#define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK) - -/* - * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w - * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR) - * (was Response buffer in docs) - */ -#define FIT_MSG_TO_DEVICE_ARG 0x580u - -/* - * Hardware (ASIC) version, 32-bit r/o - */ -#define FIT_HW_VERSION 0x588u - -/* - * Scatter/gather list descriptor. - * 32-bytes and must be aligned on a 32-byte boundary. - * All fields are in little endian order. - */ -struct fit_sg_descriptor { - uint32_t control; - uint32_t byte_count; - uint64_t host_side_addr; - uint64_t dev_side_addr; - uint64_t next_desc_ptr; -}; - -#define FIT_SGD_CONTROL_NOT_LAST 0x000u -#define FIT_SGD_CONTROL_LAST 0x40Eu - -/* - * Header at the beginning of a FIT message. The header - * is followed by SSDI requests each 64 bytes. - * A FIT message can be up to 512 bytes long and must start - * on a 64-byte boundary. - */ -struct fit_msg_hdr { - uint8_t protocol_id; - uint8_t num_protocol_cmds_coalesced; - uint8_t _reserved[62]; -}; - -#define FIT_PROTOCOL_ID_FIT 1 -#define FIT_PROTOCOL_ID_SSDI 2 -#define FIT_PROTOCOL_ID_SOFIT 3 - - -#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF) -#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF) - -/* - * Format of a completion entry. The completion queue is circular - * and must have at least as many entries as the maximum number - * of commands that may be issued to the device. - * - * There are no head/tail pointers. The cycle value is used to - * infer the presence of new completion records. - * Initially the cycle in all entries is 0, the index is 0, and - * the cycle value to expect is 1. When completions are added - * their cycle values are set to 1. When the index wraps the - * cycle value to expect is incremented. - * - * Command_context is opaque and taken verbatim from the SSDI command. - * All other fields are big endian. - */ -#define FIT_PROTOCOL_VERSION_0 0 - -/* - * Protocol major version 1 completion entry. - * The major protocol version is found in bits - * 20-23 of the FIT_MTD_FITFW_INIT response. 
- */
-struct fit_completion_entry_v1 {
- __be32 num_returned_bytes;
- uint16_t tag;
- uint8_t status; /* SCSI status */
- uint8_t cycle;
-};
-#define FIT_PROTOCOL_VERSION_1 1
-#define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1
-
-struct fit_comp_error_info {
- uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */
- uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
- uint8_t reserved0; /* 01: Obsolete field */
- uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */
- uint8_t reserved2:1; /* 02: Reserved bit. */
- uint8_t bad_length:1; /* 02: Incorrect Length Indicator */
- uint8_t end_medium:1; /* 02: End of Medium */
- uint8_t file_mark:1; /* 02: Filemark */
- uint8_t info[4]; /* 03: */
- uint8_t reserved1; /* 07: Additional Sense Length */
- uint8_t cmd_spec[4]; /* 08: Command Specific Information */
- uint8_t code; /* 0C: Additional Sense Code */
- uint8_t qual; /* 0D: Additional Sense Code Qualifier */
- uint8_t fruc; /* 0E: Field Replaceable Unit Code */
- uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */
- uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */
- uint16_t sks_low; /* 10: Sense Key Specific (LSW) */
- uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */
- uint16_t uec; /* 14: Additional Sense Bytes */
- uint64_t per __packed; /* 16: Additional Sense Bytes */
- uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
-};
-
-
-/* Task management constants */
-#define SOFT_TASK_SIMPLE 0x00
-#define SOFT_TASK_HEAD_OF_QUEUE 0x01
-#define SOFT_TASK_ORDERED 0x02
-
-/* Version zero has the last 32 bits reserved,
- * Version one has the last 32 bits sg_list_len_bytes;
- */
-struct skd_command_header {
- __be64 sg_list_dma_address;
- uint16_t tag;
- uint8_t attribute;
- uint8_t add_cdb_len; /* In 32 bit words */
- __be32 sg_list_len_bytes;
-};
-
-struct skd_scsi_request {
- struct skd_command_header hdr;
- unsigned char cdb[16];
-/* unsigned char _reserved[16]; */
-};
-
-struct driver_inquiry_data {
- uint8_t peripheral_device_type:5;
- uint8_t qualifier:3;
- uint8_t page_code;
- __be16 page_length;
- __be16 pcie_bus_number;
- uint8_t pcie_device_number;
- uint8_t pcie_function_number;
- uint8_t pcie_link_speed;
- uint8_t pcie_link_lanes;
- __be16 pcie_vendor_id;
- __be16 pcie_device_id;
- __be16 pcie_subsystem_vendor_id;
- __be16 pcie_subsystem_device_id;
- uint8_t reserved1[2];
- uint8_t reserved2[3];
- uint8_t driver_version_length;
- uint8_t driver_version[0x14];
-};
-
-#endif /* SKD_S1120_H */

From 71dda2a5625f31bc3410cb69c3d31376a2b66f28 Mon Sep 17 00:00:00 2001
From: dongdong tao
Date: Wed, 10 Feb 2021 13:07:23 +0800
Subject: [PATCH 33/64] bcache: consider the fragmentation when updating the writeback rate

The current way of calculating the writeback rate only considers the
dirty sectors. This usually works fine when fragmentation is not high,
but it yields an unreasonably small rate when very few dirty sectors
have consumed a lot of dirty buckets. In some cases the dirty buckets
can reach CUTOFF_WRITEBACK_SYNC while the dirty data (sectors) has not
even reached writeback_percent; the writeback rate will then still be
the minimum value (4k), causing all writes to get stuck in a
non-writeback mode because of the slow writeback.
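(Some illustrative arithmetic, not taken from the patch: with
1024-sector (512 KiB) buckets, one million dirty buckets each holding
only 8 dirty sectors (4 KiB) pin the entire bucket population of a
~512 GiB cache while carrying under 1% of the cache as dirty data, so
the dirty-bucket count can hit CUTOFF_WRITEBACK_SYNC long before the
dirty-sector count reaches writeback_percent.)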
We accelerate the rate in 3 stages with different aggressiveness: the
first stage starts when the dirty bucket percentage reaches
BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW (50), the second when it reaches
BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID (57), and the third when it
reaches BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH (64). By default the
first stage tries to write back the amount of dirty data in one bucket
(on average) in (1 / (dirty_buckets_percent - 50)) second, the second
stage in (1 / (dirty_buckets_percent - 57)) * 100 millisecond, and the
third stage in (1 / (dirty_buckets_percent - 64)) millisecond.

The initial rate at each stage can be controlled by 3 configurable
parameters, writeback_rate_fp_term_{low|mid|high}; they default to 1,
10 and 1000 respectively. The IO throughput these values try to achieve
is described by the paragraph above; the reason I chose those values as
defaults is based on testing and production data, detailed below:

A. When it comes to the low stage, we are still a bit away from the
cutoff threshold (70), so we only want to give writeback a little push
by setting the term to 1. That means the initial rate will be 170 if
the fragmentation is 6 (the rate is calculated as bucket_size/fragment).
This rate is very small, but still much more reasonable than the
minimum of 8. For a production bcache with an unheavy workload, if the
cache device is bigger than 1 TB it may take hours to consume 1% of the
buckets, so it is very possible to reclaim enough dirty buckets in this
stage and avoid entering the next stage.

B. If the dirty bucket ratio didn't turn around during the first stage,
we enter the mid stage, which needs to be more aggressive than the low
stage, so I chose an initial rate 10 times higher, which means 1700 as
the initial rate if the fragmentation is 6. This is a normal rate we
usually see for an ordinary workload when writeback happens because of
writeback_percent.

C. If the dirty bucket ratio didn't turn around during the low and mid
stages, we enter the third stage, the last chance to turn around and
avoid the horrible cutoff writeback sync issue. Here we choose to be
100 times more aggressive than the mid stage, which means 170000 as the
initial rate if the fragmentation is 6. This is also inferred from a
production bcache: I've got one week of writeback rate data from a
production bcache with quite heavy workloads (again with writeback
triggered by the writeback percent), and the highest rate area is
around 100000 to 240000, so I believe this kind of aggressiveness is
reasonable for production at this stage. It should also be mostly
enough, because the hint tries to reclaim 1000 buckets per second while
that heavy production environment consumed 50 buckets per second on
average over the week.

The option writeback_consider_fragment controls whether this feature is
on or off; it is on by default.
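The staged term selection is simple enough to model outside the kernel.
A condensed sketch of the selection logic with the default terms
(identifiers are shortened from the patch; the standalone framing is
illustrative):

	#include <stdint.h>

	#define THRESHOLD_LOW	50
	#define THRESHOLD_MID	57
	#define THRESHOLD_HIGH	64

	/* Defaults for writeback_rate_fp_term_{low,mid,high} in this patch. */
	static const int64_t fp_low = 1, fp_mid = 10, fp_high = 1000;

	/*
	 * Fragmentation-based proportional term for a given dirty-bucket
	 * percentage (in_use): zero at or below the low threshold, then an
	 * increasingly aggressive multiplier as each threshold is crossed.
	 */
	static int64_t fragment_fp_term(int64_t in_use)
	{
		if (in_use <= THRESHOLD_LOW)
			return 0;
		if (in_use <= THRESHOLD_MID)
			return fp_low * (in_use - THRESHOLD_LOW);
		if (in_use <= THRESHOLD_HIGH)
			return fp_mid * (in_use - THRESHOLD_MID);
		return fp_high * (in_use - THRESHOLD_HIGH);
	}

The term is then scaled by dirty/dirty_buckets, and only overrides the
normal proportional term when the fragmentation exceeds 3, as the
writeback.c hunk below shows.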
Lastly, below is the performance data for all the testing result, including the data from production env: https://docs.google.com/document/d/1AmbIEa_2MhB9bqhC3rfga9tp7n9YX9PLn0jSUxscVW0/edit?usp=sharing Signed-off-by: dongdong tao Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 4 ++++ drivers/md/bcache/sysfs.c | 23 +++++++++++++++++++ drivers/md/bcache/writeback.c | 42 +++++++++++++++++++++++++++++++++++ drivers/md/bcache/writeback.h | 4 ++++ 4 files changed, 73 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d57f48307e6..d7a84327b7f1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -373,6 +373,7 @@ struct cached_dev { unsigned int partial_stripes_expensive:1; unsigned int writeback_metadata:1; unsigned int writeback_running:1; + unsigned int writeback_consider_fragment:1; unsigned char writeback_percent; unsigned int writeback_delay; @@ -385,6 +386,9 @@ struct cached_dev { unsigned int writeback_rate_update_seconds; unsigned int writeback_rate_i_term_inverse; unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_fp_term_low; + unsigned int writeback_rate_fp_term_mid; + unsigned int writeback_rate_fp_term_high; unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 00a520c03f41..eef15f8022ba 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -117,10 +117,14 @@ rw_attribute(writeback_running); rw_attribute(writeback_percent); rw_attribute(writeback_delay); rw_attribute(writeback_rate); +rw_attribute(writeback_consider_fragment); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_fp_term_low); +rw_attribute(writeback_rate_fp_term_mid); +rw_attribute(writeback_rate_fp_term_high); rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); @@ -195,6 +199,7 @@ SHOW(__bch_cached_dev) var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); + var_printf(writeback_consider_fragment, "%i"); var_print(writeback_delay); var_print(writeback_percent); sysfs_hprint(writeback_rate, @@ -205,6 +210,9 @@ SHOW(__bch_cached_dev) var_print(writeback_rate_update_seconds); var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_fp_term_low); + var_print(writeback_rate_fp_term_mid); + var_print(writeback_rate_fp_term_high); var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { @@ -303,6 +311,7 @@ STORE(__cached_dev) sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); sysfs_strtoul_bool(writeback_running, dc->writeback_running); + sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment); sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, @@ -331,6 +340,16 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_p_term_inverse, dc->writeback_rate_p_term_inverse, 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_fp_term_low, + dc->writeback_rate_fp_term_low, + 1, dc->writeback_rate_fp_term_mid - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_low + 1, + 
dc->writeback_rate_fp_term_high - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_mid + 1, UINT_MAX); sysfs_strtoul_clamp(writeback_rate_minimum, dc->writeback_rate_minimum, 1, UINT_MAX); @@ -499,9 +518,13 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_delay, &sysfs_writeback_percent, &sysfs_writeback_rate, + &sysfs_writeback_consider_fragment, &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_fp_term_low, + &sysfs_writeback_rate_fp_term_mid, + &sysfs_writeback_rate_fp_term_high, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, &sysfs_io_errors, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index a129e4d2707c..82d4e0880a99 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t integral_scaled; uint32_t new_rate; + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. + */ + struct cache_set *c = dc->disk.c; + + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } + if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), dc->writeback_rate.next + NSEC_PER_MSEC))) { @@ -977,6 +1015,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_metadata = true; dc->writeback_running = false; + dc->writeback_consider_fragment = true; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -984,6 +1023,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; dc->writeback_rate_i_term_inverse = 10000; WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 3f1230e22de0..02b2f9df73f6 100644 
--- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,10 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 + #define BCH_DIRTY_INIT_THRD_MAX 64 /* * 14 (16384ths) is chosen here as something that each backing device From d7fae7b4fa152795ab70c680d3a63c7843c9368c Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Wed, 10 Feb 2021 13:07:24 +0800 Subject: [PATCH 34/64] bcache: Fix register_device_aync typo Should be `register_device_async`. Cc: Coly Li Signed-off-by: Kai Krakow Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 193fe7652329..dfbaf6aa3e4f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2517,7 +2517,7 @@ out: module_put(THIS_MODULE); } -static void register_device_aync(struct async_reg_args *args) +static void register_device_async(struct async_reg_args *args) { if (SB_IS_BDEV(args->sb)) INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker); @@ -2611,7 +2611,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, args->sb = sb; args->sb_disk = sb_disk; args->bdev = bdev; - register_device_aync(args); + register_device_async(args); /* No wait and returns to user space */ goto async_done; } From 9f233ffe02e5cef611100cd8c5bcf4de26ca7bef Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Wed, 10 Feb 2021 13:07:25 +0800 Subject: [PATCH 35/64] Revert "bcache: Kill btree_io_wq" This reverts commit 56b30770b27d54d68ad51eccc6d888282b568cee. With the btree using the `system_wq`, I seem to see a lot more desktop latency than I should. After some more investigation, it looks like the original assumption of 56b3077 no longer is true, and bcache has a very high potential of congesting the `system_wq`. In turn, this introduces laggy desktop performance, IO stalls (at least with btrfs), and input events may be delayed. So let's revert this. It's important to note that the semantics of using `system_wq` previously mean that `btree_io_wq` should be created before and destroyed after other bcache wqs to keep the same assumptions. 
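The ordering constraint described above reads roughly like this in
code; a minimal sketch with invented names, not the actual bcache init
paths:

	#include <linux/module.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *btree_io_wq;
	static struct workqueue_struct *other_wq;

	static int __init example_init(void)
	{
		/* Created first: like system_wq, it must exist before any user. */
		btree_io_wq = create_singlethread_workqueue("bch_btree_io");
		if (!btree_io_wq)
			return -ENOMEM;

		other_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
		if (!other_wq) {
			destroy_workqueue(btree_io_wq);
			return -ENOMEM;
		}
		return 0;
	}

	static void __exit example_exit(void)
	{
		/* Destroyed in reverse: drain the users before their target. */
		destroy_workqueue(other_wq);
		destroy_workqueue(btree_io_wq);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");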
Cc: Coly Li Cc: stable@vger.kernel.org # 5.4+ Signed-off-by: Kai Krakow Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/btree.c | 21 +++++++++++++++++++-- drivers/md/bcache/super.c | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d7a84327b7f1..2b8c7dd2cfae 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1046,5 +1046,7 @@ void bch_debug_exit(void); void bch_debug_init(void); void bch_request_exit(void); int bch_request_init(void); +void bch_btree_exit(void); +int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 910df242c83d..952f022db5a5 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -99,6 +99,8 @@ #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) +static struct workqueue_struct *btree_io_wq; + #define insert_lock(s, b) ((b)->level <= (s)->lock) @@ -308,7 +310,7 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - schedule_delayed_work(&b->work, 30 * HZ); + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } @@ -481,7 +483,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) BUG_ON(!i->keys); if (!btree_node_dirty(b)) - schedule_delayed_work(&b->work, 30 * HZ); + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); set_btree_node_dirty(b); @@ -2764,3 +2766,18 @@ void bch_keybuf_init(struct keybuf *buf) spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); +} + +int __init bch_btree_init(void) +{ + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); + if (!btree_io_wq) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dfbaf6aa3e4f..97405aec4b51 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2821,6 +2821,7 @@ static void bcache_exit(void) destroy_workqueue(bcache_wq); if (bch_journal_wq) destroy_workqueue(bch_journal_wq); + bch_btree_exit(); if (bcache_major) unregister_blkdev(bcache_major, "bcache"); @@ -2876,6 +2877,9 @@ static int __init bcache_init(void) return bcache_major; } + if (bch_btree_init()) + goto err; + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); if (!bcache_wq) goto err; From d797bd9897e3559eb48d68368550d637d32e468c Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Wed, 10 Feb 2021 13:07:26 +0800 Subject: [PATCH 36/64] bcache: Give btree_io_wq correct semantics again Before killing `btree_io_wq`, the queue was allocated using `create_singlethread_workqueue()` which has `WQ_MEM_RECLAIM`. After killing it, it no longer had this property but `system_wq` is not single threaded. Let's combine both worlds and make it multi threaded but able to reclaim memory. 
Cc: Coly Li
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Kai Krakow
Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 952f022db5a5..fe6dce125aba 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2775,7 +2775,7 @@ void bch_btree_exit(void)

 int __init bch_btree_init(void)
 {
-	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
+	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
 	if (!btree_io_wq)
 		return -ENOMEM;

From afe78ab46f638ecdf80a35b122ffc92c20d9ae5d Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Wed, 10 Feb 2021 13:07:27 +0800
Subject: [PATCH 37/64] bcache: Move journal work to new flush wq

This work is potentially long running and not latency sensitive; let's
get it out of the way of other latency-sensitive events.

As observed in the previous commit, the `system_wq` easily becomes
congested by bcache, and this fixes a few more stalls I was observing
every once in a while.

Let's not make this `WQ_MEM_RECLAIM` as it was shown to reduce the
performance of boot and file system operations in my tests. Also,
without `WQ_MEM_RECLAIM`, I no longer see desktop stalls. This matches
the previous behavior as `system_wq` also does no memory reclaim:

> // workqueue.c:
> system_wq = alloc_workqueue("events", 0, 0);

Cc: Coly Li
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Kai Krakow
Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/bcache.h  |  1 +
 drivers/md/bcache/journal.c |  4 ++--
 drivers/md/bcache/super.c   | 16 ++++++++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2b8c7dd2cfae..848dd4db1659 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1005,6 +1005,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);

 extern struct workqueue_struct *bcache_wq;
 extern struct workqueue_struct *bch_journal_wq;
+extern struct workqueue_struct *bch_flush_wq;
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index aefbdb7e003b..c6613e817333 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -932,8 +932,8 @@ atomic_t *bch_journal(struct cache_set *c,
 		journal_try_write(c);
 	} else if (!w->dirty) {
 		w->dirty = true;
-		schedule_delayed_work(&c->journal.work,
-				      msecs_to_jiffies(c->journal_delay_ms));
+		queue_delayed_work(bch_flush_wq, &c->journal.work,
+				   msecs_to_jiffies(c->journal_delay_ms));
 		spin_unlock(&c->journal.lock);
 	} else {
 		spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 97405aec4b51..71691f32959b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -49,6 +49,7 @@ static int bcache_major;
 static DEFINE_IDA(bcache_device_idx);
 static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_flush_wq;
 struct workqueue_struct *bch_journal_wq;

@@ -2821,6 +2822,8 @@ static void bcache_exit(void)
 		destroy_workqueue(bcache_wq);
 	if (bch_journal_wq)
 		destroy_workqueue(bch_journal_wq);
+	if (bch_flush_wq)
+		destroy_workqueue(bch_flush_wq);
 	bch_btree_exit();

 	if (bcache_major)
@@ -2884,6 +2887,19 @@ static int __init bcache_init(void)
 	if (!bcache_wq)
 		goto err;

+	/*
+	 * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
+ * + * 1. It used `system_wq` before which also does no memory reclaim. + * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and + * reduced throughput can be observed. + * + * We still want to user our own queue to not congest the `system_wq`. + */ + bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + if (!bch_flush_wq) + goto err; + bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); if (!bch_journal_wq) goto err; From 6751c1e3cff3aa763c760c08862627069a37b50e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 10 Feb 2021 13:07:28 +0800 Subject: [PATCH 38/64] bcache: Avoid comma separated statements Use semicolons and braces. Signed-off-by: Joe Perches Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 12 ++++++++---- drivers/md/bcache/sysfs.c | 6 ++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 67a2c47f4201..94d38e8a59b3 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -712,8 +712,10 @@ void bch_bset_build_written_tree(struct btree_keys *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) < cacheline) - prev = k, k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) { + prev = k; + k = bkey_next(k); + } t->prev[j] = bkey_u64s(prev); t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); @@ -901,8 +903,10 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, status = BTREE_INSERT_STATUS_INSERT; while (m != bset_bkey_last(i) && - bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) - prev = m, m = bkey_next(m); + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) { + prev = m; + m = bkey_next(m); + } /* prev is in the tree, if we merge we're done */ status = BTREE_INSERT_STATUS_BACK_MERGE; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index eef15f8022ba..cc89f3156d1a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1094,8 +1094,10 @@ SHOW(__bch_cache) --n; while (cached < p + n && - *cached == BTREE_PRIO) - cached++, n--; + *cached == BTREE_PRIO) { + cached++; + n--; + } for (i = 0; i < n; i++) sum += INITIAL_PRIO - cached[i]; From f720a8edbc6470fad8b47d0d4ae092a6c63340bb Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 2 Feb 2021 15:06:17 +0800 Subject: [PATCH 39/64] nvme: convert sysfs sprintf/snprintf family to sysfs_emit Fix the following coccicheck warning: ./drivers/nvme/host/core.c:3580:8-16: WARNING: use scnprintf or sprintf. ./drivers/nvme/host/core.c:3570:8-16: WARNING: use scnprintf or sprintf. ./drivers/nvme/host/core.c:3560:8-16: WARNING: use scnprintf or sprintf. ./drivers/nvme/host/core.c:3526:8-16: WARNING: use scnprintf or sprintf. ./drivers/nvme/host/core.c:2833:8-16: WARNING: use scnprintf or sprintf. 
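sysfs_emit() bakes the sysfs contract (a page-sized, page-aligned
output buffer) into the helper, so show() callbacks no longer repeat
PAGE_SIZE by hand. A before/after sketch with an invented attribute:

	#include <linux/device.h>
	#include <linux/sysfs.h>

	/* Before: every callback must remember the buffer is PAGE_SIZE bytes. */
	static ssize_t foo_show_old(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		return snprintf(buf, PAGE_SIZE, "%d\n", 42);
	}

	/* After: sysfs_emit() checks that buf really is a sysfs page and
	 * returns the number of bytes written; no length to get wrong. */
	static ssize_t foo_show(struct device *dev,
				struct device_attribute *attr, char *buf)
	{
		return sysfs_emit(buf, "%d\n", 42);
	}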
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4e8e310033c9..0befaad788a0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2848,7 +2848,7 @@ static ssize_t nvme_subsys_show_nqn(struct device *dev, struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); + return sysfs_emit(buf, "%s\n", subsys->subnqn); } static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); @@ -3541,7 +3541,7 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); + return sysfs_emit(buf, "%s\n", ctrl->ops->name); } static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); @@ -3575,7 +3575,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); + return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -3585,7 +3585,7 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn); + return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn); } static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); @@ -3595,7 +3595,7 @@ static ssize_t nvme_sysfs_show_hostid(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id); + return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id); } static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); From 83fba8c8114748a18e20391565cfdfdf8466075c Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Mon, 1 Feb 2021 11:49:38 +0800 Subject: [PATCH 40/64] blk-mq: introduce blk_mq_set_request_complete nvme drivers need to set the state of request to MQ_RQ_COMPLETE when directly complete request in queue_rq. So add blk_mq_set_request_complete. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- include/linux/blk-mq.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index aabbf6830ffc..2c473c9b8990 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -490,6 +490,18 @@ static inline int blk_mq_request_completed(struct request *rq) return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } +/* + * + * Set the state to complete when completing a request from inside ->queue_rq. + * This is used by drivers that want to ensure special complete actions that + * need access to the request are called on failure, e.g. by nvme for + * multipathing. 
+ */
+static inline void blk_mq_set_request_complete(struct request *rq)
+{
+	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+}
+
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);

From dda3248e7fc306e0ce3612ae96bdd9a36e2ab04f Mon Sep 17 00:00:00 2001
From: Chao Leng
Date: Thu, 4 Feb 2021 08:55:11 +0100
Subject: [PATCH 41/64] nvme: introduce a nvme_host_path_error helper

When using nvme native multipathing, if a path related error occurs
during ->queue_rq, the request needs to be completed with
NVME_SC_HOST_PATH_ERROR so that the request can be failed over.

Introduce a helper to complete the command from ->queue_rq in a way
that invokes nvme_complete_rq.

Signed-off-by: Chao Leng
[hch: renamed, added a return value to clean up the callers a bit]
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 15 +++++++++++++++
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0befaad788a0..02579f4f776c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -355,6 +355,21 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);

+/*
+ * Called to unwind from ->queue_rq on a failed command submission so that the
+ * multipathing code gets called to potentially failover to another path.
+ * The caller needs to unwind all transport specific resource allocations and
+ * must return propagate the return value.
+ */
+blk_status_t nvme_host_path_error(struct request *req)
+{
+	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+	blk_mq_set_request_complete(req);
+	nvme_complete_rq(req);
+	return BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_host_path_error);
+
 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a72f07181091..5819f0381041 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -575,6 +575,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }

 void nvme_complete_rq(struct request *req);
+blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);

From ea5e5f42cd2c80d19862dd63a2f3a4e7a99c6a20 Mon Sep 17 00:00:00 2001
From: Chao Leng
Date: Mon, 1 Feb 2021 11:49:39 +0800
Subject: [PATCH 42/64] nvme-fabrics: avoid double completions in nvmf_fail_nonready_command

When reconnecting, the request may be completed with
NVME_SC_HOST_PATH_ERROR in nvmf_fail_nonready_command, which currently
sets the state of the request to MQ_RQ_IN_FLIGHT before calling
nvme_complete_rq. When this happens for a request that is freed by the
caller, such as nvme_submit_user_cmd, in the worst case the request
could be completed again in the teardown process.

Instead of calling blk_mq_start_request from
nvmf_fail_nonready_command, just use the new nvme_host_path_error
helper to complete the command without starting it.
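The intended call pattern from a transport's ->queue_rq then looks
roughly as follows; foo_queue_rq() and foo_submit() are invented
stand-ins, not an actual transport:

	static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
					 const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;
		int err;

		err = foo_submit(rq);	/* hypothetical submission path */
		if (err == -EIO) {
			/*
			 * Path error: nvme_host_path_error() forces the request
			 * to MQ_RQ_COMPLETE and runs nvme_complete_rq(), so
			 * multipath can fail it over and teardown can never
			 * complete it a second time.
			 */
			return nvme_host_path_error(rq);
		}
		return err ? BLK_STS_IOERR : BLK_STS_OK;
	}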
Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 72ac00173500..5dfd806fc2d2 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -552,11 +552,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; - - nvme_req(rq)->status = NVME_SC_HOST_PATH_ERROR; - blk_mq_start_request(rq); - nvme_complete_rq(rq); - return BLK_STS_OK; + return nvme_host_path_error(rq); } EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); From 62eca39722fd997e3621fc903229917b9f0fb271 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Mon, 1 Feb 2021 11:49:40 +0800 Subject: [PATCH 43/64] nvme-rdma: handle nvme_rdma_post_send failures better nvme_rdma_post_send failing is a path related error and should bounce to another path when using nvme-multipath. Call nvme_host_path_error when nvme_rdma_post_send returns -EIO to ensure nvme_complete_rq gets invoked to fail over to another path if there is one. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6700d8bab68a..53ac4d7442ba 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2098,7 +2098,9 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, err_unmap: nvme_rdma_unmap_data(queue, rq); err: - if (err == -ENOMEM || err == -EAGAIN) + if (err == -EIO) + ret = nvme_host_path_error(rq); + else if (err == -ENOMEM || err == -EAGAIN) ret = BLK_STS_RESOURCE; else ret = BLK_STS_IOERR; From fda871c0ba5d2eed2cd1c881573168129da70058 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 3 Feb 2021 15:00:01 -0800 Subject: [PATCH 44/64] nvmet-tcp: fix receive data digest calculation for multiple h2cdata PDUs When a host sends multiple h2cdata PDUs for a single command, we should verify the data digest calculation per PDU and not per command. 
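The fix below folds each PDU's receive buffers into the digest with the
standard ahash init/update/final sequence; a condensed sketch of that
pattern (the helper name is invented, the calls mirror the hunk that
follows):

	/* Incrementally hash a vector of receive buffers into *dgst. */
	static void ddgst_iov(struct ahash_request *hash,
			      struct kvec *iov, int nr_mapped, __le32 *dgst)
	{
		struct scatterlist sg;
		int i;

		crypto_ahash_init(hash);
		for (i = 0; i < nr_mapped; i++, iov++) {
			sg_init_one(&sg, iov->iov_base, iov->iov_len);
			ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
			crypto_ahash_update(hash);
		}
		ahash_request_set_crypt(hash, NULL, (void *)dgst, 0);
		crypto_ahash_final(hash);
	}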
Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reported-by: Narayan Ayalasomayajula Tested-by: Narayan Ayalasomayajula Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index dc1f0f647189..c3da50f776fa 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -378,7 +378,7 @@ err: return NVME_SC_INTERNAL; } -static void nvmet_tcp_ddgst(struct ahash_request *hash, +static void nvmet_tcp_send_ddgst(struct ahash_request *hash, struct nvmet_tcp_cmd *cmd) { ahash_request_set_crypt(hash, cmd->req.sg, @@ -386,6 +386,23 @@ static void nvmet_tcp_ddgst(struct ahash_request *hash, crypto_ahash_digest(hash); } +static void nvmet_tcp_recv_ddgst(struct ahash_request *hash, + struct nvmet_tcp_cmd *cmd) +{ + struct scatterlist sg; + struct kvec *iov; + int i; + + crypto_ahash_init(hash); + for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) { + sg_init_one(&sg, iov->iov_base, iov->iov_len); + ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len); + crypto_ahash_update(hash); + } + ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0); + crypto_ahash_final(hash); +} + static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; @@ -410,7 +427,7 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) if (queue->data_digest) { pdu->hdr.flags |= NVME_TCP_F_DDGST; - nvmet_tcp_ddgst(queue->snd_hash, cmd); + nvmet_tcp_send_ddgst(queue->snd_hash, cmd); } if (cmd->queue->hdr_digest) { @@ -1059,7 +1076,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) { struct nvmet_tcp_queue *queue = cmd->queue; - nvmet_tcp_ddgst(queue->rcv_hash, cmd); + nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd); queue->offset = 0; queue->left = NVME_TCP_DIGEST_LENGTH; queue->rcv_state = NVMET_TCP_RECV_DDGST; @@ -1080,14 +1097,14 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) cmd->rbytes_done += ret; } + if (queue->data_digest) { + nvmet_tcp_prep_recv_ddgst(cmd); + return 0; + } nvmet_tcp_unmap_pdu_iovec(cmd); if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && cmd->rbytes_done == cmd->req.transfer_len) { - if (queue->data_digest) { - nvmet_tcp_prep_recv_ddgst(cmd); - return 0; - } cmd->req.execute(&cmd->req); } From 0fbcfb089a3f2f2a731d01f0aec8f7697a849c28 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 5 Feb 2021 11:47:25 -0800 Subject: [PATCH 45/64] nvmet-tcp: fix potential race of tcp socket closing accept_work When we accept a TCP connection and allocate an nvmet-tcp queue we should make sure not to fully establish it or reference it as the connection may be already closing, which triggers queue release work, which does not fence against queue establishment. In order to address such a race, we make sure to check the sk_state and contain the queue reference to be done underneath the sk_callback_lock such that the queue release work correctly fences against it. 
Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reported-by: Elad Grupi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index c3da50f776fa..ac2d9ed23cea 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1484,17 +1484,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) if (inet->rcv_tos > 0) ip_sock_set_tos(sock->sk, inet->rcv_tos); + ret = 0; write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_user_data = queue; - queue->data_ready = sock->sk->sk_data_ready; - sock->sk->sk_data_ready = nvmet_tcp_data_ready; - queue->state_change = sock->sk->sk_state_change; - sock->sk->sk_state_change = nvmet_tcp_state_change; - queue->write_space = sock->sk->sk_write_space; - sock->sk->sk_write_space = nvmet_tcp_write_space; + if (sock->sk->sk_state != TCP_ESTABLISHED) { + /* + * If the socket is already closing, don't even start + * consuming it + */ + ret = -ENOTCONN; + } else { + sock->sk->sk_user_data = queue; + queue->data_ready = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = nvmet_tcp_data_ready; + queue->state_change = sock->sk->sk_state_change; + sock->sk->sk_state_change = nvmet_tcp_state_change; + queue->write_space = sock->sk->sk_write_space; + sock->sk->sk_write_space = nvmet_tcp_write_space; + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + } write_unlock_bh(&sock->sk->sk_callback_lock); - return 0; + return ret; } static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, @@ -1542,8 +1552,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_destroy_sq; - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); - return 0; out_destroy_sq: mutex_lock(&nvmet_tcp_queue_mutex); From 73a1a2298f3e9df24cea7a9aab412ba9470f6159 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 5 Feb 2021 11:50:02 -0800 Subject: [PATCH 46/64] nvme-multipath: set nr_zones for zoned namespaces The bio based drivers only require the request_queue's nr_zones is set, so set this field in the head if the namespace path is zoned. Fixes: 240e6ee272c07 ("nvme: support for zoned namespaces") Reported-by: Minwoo Im Cc: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 65bd6efa5e1c..0696319adaf6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -677,6 +677,10 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_stable_writes(ns->queue) && ns->head->disk) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->head->disk->queue); +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(ns->queue) && ns->head->disk) + ns->head->disk->queue->nr_zones = ns->queue->nr_zones; +#endif } void nvme_mpath_remove_disk(struct nvme_ns_head *head) From b5df8e79a293739f031f25eb45de350165033ea4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Feb 2021 17:17:34 +0100 Subject: [PATCH 47/64] nvmet-fc: add a missing __rcu annotation to nvmet_fc_tgt_assoc.queues Make sparse happy after the recent conversion to RCU lookups. 
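The `__rcu` address-space marker is purely a sparse annotation: it
makes the checker insist that the pointer is only touched through the
RCU accessors. A generic sketch with invented types:

	struct bar {
		bool active;
	};

	struct foo {
		struct bar __rcu *cur;	/* writers use rcu_assign_pointer() */
	};

	static bool foo_is_active(struct foo *f)
	{
		struct bar *b;
		bool ret;

		rcu_read_lock();
		/* sparse is only happy with reads through rcu_dereference() */
		b = rcu_dereference(f->cur);
		ret = b && b->active;
		rcu_read_unlock();
		return ret;
	}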
Fixes: 4e2f02bf77da ("nvmet-fc: use RCU proctection for assoc_list")
Signed-off-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: James Smart
---
 drivers/nvme/target/fc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index c14c60bfdf85..d375745fc4ed 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -165,7 +165,7 @@ struct nvmet_fc_tgt_assoc {
 	struct nvmet_fc_hostport	*hostport;
 	struct nvmet_fc_ls_iod		*rcv_disconn;
 	struct list_head		a_list;
-	struct nvmet_fc_tgt_queue	*queues[NVMET_NR_QUEUES + 1];
+	struct nvmet_fc_tgt_queue __rcu	*queues[NVMET_NR_QUEUES + 1];
 	struct kref			ref;
 	struct work_struct		del_work;
 	struct rcu_head			rcu;

From 40244ad36bcfb796a6bb9e95bdcbf8ddf3134509 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:47:52 -0800
Subject: [PATCH 48/64] nvmet: set status to 0 in case of invalid nsid

For an unallocated namespace in nvmet_execute_identify_ns(), don't set
the status to NVME_SC_INVALID_NS; set it to zero.

Fixes: bffcd507780e ("nvmet: set right status on error in id-ns handler")
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/admin-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 613a4d8feac1..5070ea5cf260 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -485,7 +485,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	/* return an all zeroed buffer if we can't find an active namespace */
 	req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid);
 	if (!req->ns) {
-		status = NVME_SC_INVALID_NS;
+		status = 0;
 		goto done;
 	}

From aa0aff604a60627b9f6c51c99dd5f63634322668 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:47:53 -0800
Subject: [PATCH 49/64] nvmet: return uniform error for invalid ns

For the nvmet_find_namespace() error case we have inconsistent error
code mapping in the functions nvmet_get_smart_log_nsid() and
nvmet_set_feat_write_protect(). There is no point in retrying for an
invalid namespace from the host side. Set the error code to
NVME_SC_INVALID_NS | NVME_SC_DNR, which matches what we have in
nvmet_execute_identify_desclist().
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/admin-cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 5070ea5cf260..e938064254a5 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -82,7 +82,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		pr_err("Could not find namespace id : %d\n",
 				le32_to_cpu(req->cmd->get_log_page.nsid));
 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
-		return NVME_SC_INVALID_NS;
+		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}

 	/* we don't have the right data for file backed ns */
@@ -697,7 +697,7 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return status;
+		return status = NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}

 	mutex_lock(&subsys->lock);

From 3a1f7c79ae6d3dfdc16082daa44b3cf8dbe4f238 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:47:54 -0800
Subject: [PATCH 50/64] nvmet: make nvmet_find_namespace() req based

The six callers of nvmet_find_namespace() duplicate the error log page
update and the status setting code on each failed call. All callers are
nvmet request based functions, so we can pass req to
nvmet_find_namespace() and derive ctrl from req, which allows us to
update the error log page inside nvmet_find_namespace(). Now that we
pass the request, we can also get rid of the local variable in
nvmet_find_namespace(), use req->ns, and return the error code.

Replace the ctrl parameter with nvmet_req for nvmet_find_namespace(),
centralize the error log page update for non-allocated namespaces, and
return a uniform error for a non-allocated namespace.

nvmet_find_namespace() takes an nsid parameter that comes from NVMe
command structures such as get_log_page, identify, rw and common. All
these commands have the same offset for the nsid field, so derive the
nsid from req->cmd->common.nsid and remove the extra parameter from
nvmet_find_namespace().

Lastly, now that we associate the ns with the req parameter passed to
nvmet_find_namespace(), rename nvmet_find_namespace() to
nvmet_req_find_ns().
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 46 +++++++++++++-------------------- drivers/nvme/target/core.c | 24 +++++++++-------- drivers/nvme/target/nvmet.h | 2 +- 3 files changed, 32 insertions(+), 40 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e938064254a5..f32533480e66 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -75,15 +75,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { u64 host_reads, host_writes, data_units_read, data_units_written; + u16 status; - req->ns = nvmet_find_namespace(req->sq->ctrl, - req->cmd->get_log_page.nsid); - if (!req->ns) { - pr_err("Could not find namespace id : %d\n", - le32_to_cpu(req->cmd->get_log_page.nsid)); - req->error_loc = offsetof(struct nvme_rw_command, nsid); - return NVME_SC_INVALID_NS | NVME_SC_DNR; - } + status = nvmet_req_find_ns(req); + if (status) + return status; /* we don't have the right data for file backed ns */ if (!req->ns->bdev) @@ -468,7 +464,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ns *id; - u16 status = 0; + u16 status; if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { req->error_loc = offsetof(struct nvme_identify, nsid); @@ -483,8 +479,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } /* return an all zeroed buffer if we can't find an active namespace */ - req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid); - if (!req->ns) { + status = nvmet_req_find_ns(req); + if (status) { status = 0; goto done; } @@ -604,15 +600,12 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, static void nvmet_execute_identify_desclist(struct nvmet_req *req) { - u16 status = 0; off_t off = 0; + u16 status; - req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); - if (!req->ns) { - req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; + status = nvmet_req_find_ns(req); + if (status) goto out; - } if (memchr_inv(&req->ns->uuid, 0, sizeof(req->ns->uuid))) { status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, @@ -692,13 +685,11 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) { u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; + u16 status; - req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); - if (unlikely(!req->ns)) { - req->error_loc = offsetof(struct nvme_common_command, nsid); - return status = NVME_SC_INVALID_NS | NVME_SC_DNR; - } + status = nvmet_req_find_ns(req); + if (status) + return status; mutex_lock(&subsys->lock); switch (write_protect) { @@ -799,11 +790,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 result; - req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); - if (!req->ns) { - req->error_loc = offsetof(struct nvme_common_command, nsid); - return NVME_SC_INVALID_NS | NVME_SC_DNR; - } + result = nvmet_req_find_ns(req); + if (result) + return result; + mutex_lock(&subsys->lock); if (req->ns->readonly == true) result = NVME_NS_WRITE_PROTECT; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 8ce4d59cc9e7..95b58d4b1af2 100644 --- a/drivers/nvme/target/core.c +++ 
b/drivers/nvme/target/core.c @@ -417,15 +417,18 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) cancel_delayed_work_sync(&ctrl->ka_work); } -struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +u16 nvmet_req_find_ns(struct nvmet_req *req) { - struct nvmet_ns *ns; + u32 nsid = le32_to_cpu(req->cmd->common.nsid); - ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid)); - if (ns) - percpu_ref_get(&ns->ref); + req->ns = xa_load(&req->sq->ctrl->subsys->namespaces, nsid); + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); + return NVME_SC_INVALID_NS | NVME_SC_DNR; + } - return ns; + percpu_ref_get(&req->ns->ref); + return NVME_SC_SUCCESS; } static void nvmet_destroy_namespace(struct percpu_ref *ref) @@ -862,11 +865,10 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (nvmet_req_passthru_ctrl(req)) return nvmet_parse_passthru_io_cmd(req); - req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (unlikely(!req->ns)) { - req->error_loc = offsetof(struct nvme_common_command, nsid); - return NVME_SC_INVALID_NS | NVME_SC_DNR; - } + ret = nvmet_req_find_ns(req); + if (unlikely(ret)) + return ret; + ret = nvmet_check_ana_state(req->port, req->ns); if (unlikely(ret)) { req->error_loc = offsetof(struct nvme_common_command, nsid); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8776dd1a0490..954b3d8451f5 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -443,7 +443,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, void nvmet_subsys_put(struct nvmet_subsys *subsys); void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys); -struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +u16 nvmet_req_find_ns(struct nvmet_req *req); void nvmet_put_namespace(struct nvmet_ns *ns); int nvmet_ns_enable(struct nvmet_ns *ns); void nvmet_ns_disable(struct nvmet_ns *ns); From 3999434b6ce6fa452128c36cbb5017f0cd347615 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 9 Feb 2021 21:47:55 -0800 Subject: [PATCH 51/64] nvmet: remove extra variable in id-ns handler In nvmet_execute_identify_ns() local variable ctrl is accessed only in one place, remove that and directly use it from nvmet_req->sq->ctrl. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f32533480e66..552da813da18 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -462,7 +462,6 @@ out: static void nvmet_execute_identify_ns(struct nvmet_req *req) { - struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ns *id; u16 status; @@ -523,7 +522,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) id->lbaf[0].ds = req->ns->blksize_shift; - if (ctrl->pi_support && nvmet_ns_has_pi(req->ns)) { + if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns)) { id->dpc = NVME_NS_DPC_PI_FIRST | NVME_NS_DPC_PI_LAST | NVME_NS_DPC_PI_TYPE1 | NVME_NS_DPC_PI_TYPE2 | NVME_NS_DPC_PI_TYPE3; From d81d57cf1b4702b7c2fa8ce8f1d5c6961a0c20b5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 9 Feb 2021 21:47:56 -0800 Subject: [PATCH 52/64] nvmet: add helper to report invalid opcode In the NVMeOF block device backend, file backend, and passthru backend we reject and report the commands if opcode is not handled. 
Add a helper and use it in the block device backend to keep the code
and error message uniform.

Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/core.c        | 9 +++++++++
 drivers/nvme/target/io-cmd-bdev.c | 5 +----
 drivers/nvme/target/nvmet.h       | 1 +
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 95b58d4b1af2..35ad96261b8f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -82,6 +82,15 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
 	return status;
 }

+u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
+{
+	pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
+		 req->sq->qid);
+
+	req->error_loc = offsetof(struct nvme_common_command, opcode);
+	return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+}
+
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn);

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 23095bdfce06..105ef2b125cf 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -449,9 +449,6 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 		req->execute = nvmet_bdev_execute_write_zeroes;
 		return 0;
 	default:
-		pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
-		       req->sq->qid);
-		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		return nvmet_report_invalid_opcode(req);
 	}
 }

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 954b3d8451f5..00f78e41d8c8 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -589,6 +589,7 @@ nvmet_req_passthru_ctrl(struct nvmet_req *req)
 }

 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
+u16 nvmet_report_invalid_opcode(struct nvmet_req *req);

 /* Convert a 32-bit number to a 16-bit 0's based number */
 static inline __le16 to0based(u32 a)

From 1c2c76136875d2329339275d431484a33dbb612d Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:47:57 -0800
Subject: [PATCH 53/64] nvmet: use invalid cmd opcode helper

In the NVMeOF block device backend, file backend, and passthru backend
we reject and report the commands if the opcode is not handled.

Use the previously introduced helper in the file backend to reduce the
duplicate code and make the error message uniform.

Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/io-cmd-file.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..715d4376c997 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -400,9 +400,6 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
 		req->execute = nvmet_file_execute_write_zeroes;
 		return 0;
 	default:
-		pr_err("unhandled cmd for file ns %d on qid %d\n",
-		       cmd->common.opcode, req->sq->qid);
-		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		return nvmet_report_invalid_opcode(req);
 	}
 }

From 07116ea50fd3a3b58725389e4abaf1c03bcae641 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:47:58 -0800
Subject: [PATCH 54/64] nvmet: use invalid cmd opcode helper

In the NVMeOF block device backend, file backend, and passthru backend
we reject and report the commands if the opcode is not handled.
Use the previously introduced helper in the passthru backend to make the error message uniform. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index cbc88acdd233..3b22f4a868f4 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -494,7 +494,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) return nvmet_setup_passthru_command(req); default: /* Reject commands not in the allowlist above */ - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return nvmet_report_invalid_opcode(req); } } From d86481e924a7d6e8a40477ffa98077c6c0d77ed5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 9 Feb 2021 21:47:59 -0800 Subject: [PATCH 55/64] nvmet: use min of device_path and disk len MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function __assign_req_name() instead of using the DEVICE_NAME_LEN in strncpy() use min of DISK_NAME_LEN and strlen(req->ns->device_path). This is needed to turn off the following warnings:- In file included from drivers/nvme/target/core.c:14: In function ‘__assign_req_name’, inlined from ‘trace_event_raw_event_nvmet_req_init’ at drivers/nvme/target/./trace.h:58:1: drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] strncpy(name, req->ns->device_path, DISK_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function ‘__assign_req_name’, inlined from ‘perf_trace_nvmet_req_complete’ at drivers/nvme/target/./trace.h:100:1: drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] strncpy(name, req->ns->device_path, DISK_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function ‘__assign_req_name’, inlined from ‘perf_trace_nvmet_req_init’ at drivers/nvme/target/./trace.h:58:1: drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] strncpy(name, req->ns->device_path, DISK_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function ‘__assign_req_name’, inlined from ‘trace_event_raw_event_nvmet_req_complete’ at drivers/nvme/target/./trace.h:100:1: drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] strncpy(name, req->ns->device_path, DISK_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/trace.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index c14e3249a14d..6109b3806b12 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -48,10 +48,13 @@ static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) static inline void __assign_req_name(char *name, struct nvmet_req *req) { - if (req->ns) - strncpy(name, req->ns->device_path, DISK_NAME_LEN); - else + if (!req->ns) { memset(name, 0, DISK_NAME_LEN); + return; + } + + strncpy(name, req->ns->device_path, + min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path))); } #endif From 20c2c3bb83f26c42bf62cc773f96f30848ed11a2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 9 Feb 2021 21:48:01 -0800 Subject: [PATCH 56/64] nvmet: add 
nvmet_req_subsys() helper Just like what we have to get the passthru ctrl from the req, add an helper to get the subsystem associated with the nvmet_req() instead of open coding the chain of structures. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 10 +++++----- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/nvmet.h | 7 ++++++- drivers/nvme/target/passthru.c | 4 ++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 552da813da18..bc6a774f2124 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -683,7 +683,7 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) { u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); - struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + struct nvmet_subsys *subsys = nvmet_req_subsys(req); u16 status; status = nvmet_req_find_ns(req); @@ -742,7 +742,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) void nvmet_execute_set_features(struct nvmet_req *req) { - struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + struct nvmet_subsys *subsys = nvmet_req_subsys(req); u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); u16 status = 0; @@ -786,7 +786,7 @@ void nvmet_execute_set_features(struct nvmet_req *req) static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) { - struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + struct nvmet_subsys *subsys = nvmet_req_subsys(req); u32 result; result = nvmet_req_find_ns(req); @@ -816,7 +816,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) void nvmet_execute_get_features(struct nvmet_req *req) { - struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + struct nvmet_subsys *subsys = nvmet_req_subsys(req); u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; @@ -923,7 +923,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_cmd(req); - if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + if (nvmet_req_subsys(req)->type == NVME_NQN_DISC) return nvmet_parse_discovery_cmd(req); ret = nvmet_check_ctrl_status(req, cmd); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 35ad96261b8f..7e3b194203a4 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -430,7 +430,7 @@ u16 nvmet_req_find_ns(struct nvmet_req *req) { u32 nsid = le32_to_cpu(req->cmd->common.nsid); - req->ns = xa_load(&req->sq->ctrl->subsys->namespaces, nsid); + req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid); if (unlikely(!req->ns)) { req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 00f78e41d8c8..cdfa537b1c0a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -551,6 +551,11 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req) sizeof(struct nvme_dsm_range); } +static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req) +{ + return req->sq->ctrl->subsys; +} + #ifdef CONFIG_NVME_TARGET_PASSTHRU void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); @@ -585,7 +590,7 @@ static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) static inline struct 
From 295a39f5a56f3276bae6a0ae5c26ce06bb8aa21c Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 9 Feb 2021 21:48:02 -0800
Subject: [PATCH 57/64] nvmet: remove else at the end of the function

The function nvmet_parse_io_cmd() returns the value from
nvmet_file_parse_io_cmd() or nvmet_bdev_parse_io_cmd() based on which
backend is set for the request. Remove the else and just return the
value from nvmet_bdev_parse_io_cmd().

Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 7e3b194203a4..67bbf0e3b507 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -891,8 +891,8 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 
 	if (req->ns->file)
 		return nvmet_file_parse_io_cmd(req);
-	else
-		return nvmet_bdev_parse_io_cmd(req);
+
+	return nvmet_bdev_parse_io_cmd(req);
 }
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,

From ed7770f6628691c13c9423bce7eee7cff2399c12 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Tue, 19 Jan 2021 07:43:18 +0100
Subject: [PATCH 58/64] nvme-hwmon: rework to avoid devm allocation

The original design to use device-managed resource allocation doesn't
really work as the NVMe controller has a vastly different lifetime than
the hwmon sysfs attributes, causing warnings about duplicate sysfs
entries upon reconnection. This patch reworks the hwmon allocation to
avoid device-managed resource allocation, and uses the NVMe controller
as parent for the sysfs attributes.
Cc: Guenter Roeck
Signed-off-by: Hannes Reinecke
Tested-by: Enzo Matsumiya
Tested-by: Daniel Wagner
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c  |  1 +
 drivers/nvme/host/hwmon.c | 31 +++++++++++++++++++++----------
 drivers/nvme/host/nvme.h  |  8 ++++++++
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 02579f4f776c..d77f3f26d8d3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4471,6 +4471,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	nvme_hwmon_exit(ctrl);
 	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
 	cdev_device_del(&ctrl->cdev, ctrl->device);
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 552dbc04567b..8f9e96986780 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -223,12 +223,12 @@ static const struct hwmon_chip_info nvme_hwmon_chip_info = {
 
 int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 {
-	struct device *dev = ctrl->dev;
+	struct device *dev = ctrl->device;
 	struct nvme_hwmon_data *data;
 	struct device *hwmon;
 	int err;
 
-	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return 0;
 
@@ -237,19 +237,30 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 
 	err = nvme_hwmon_get_smart_log(data);
 	if (err) {
-		dev_warn(ctrl->device,
-			 "Failed to read smart log (error %d)\n", err);
-		devm_kfree(dev, data);
+		dev_warn(dev, "Failed to read smart log (error %d)\n", err);
+		kfree(data);
 		return err;
 	}
 
-	hwmon = devm_hwmon_device_register_with_info(dev, "nvme", data,
-						     &nvme_hwmon_chip_info,
-						     NULL);
+	hwmon = hwmon_device_register_with_info(dev, "nvme",
+						data, &nvme_hwmon_chip_info,
+						NULL);
 	if (IS_ERR(hwmon)) {
 		dev_warn(dev, "Failed to instantiate hwmon device\n");
-		devm_kfree(dev, data);
+		kfree(data);
 	}
-
+	ctrl->hwmon_device = hwmon;
 	return 0;
 }
+
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->hwmon_device) {
+		struct nvme_hwmon_data *data =
+			dev_get_drvdata(ctrl->hwmon_device);
+
+		hwmon_device_unregister(ctrl->hwmon_device);
+		ctrl->hwmon_device = NULL;
+		kfree(data);
+	}
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5819f0381041..2efb87642d18 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -246,6 +246,9 @@ struct nvme_ctrl {
 	struct rw_semaphore namespaces_rwsem;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
+#ifdef CONFIG_NVME_HWMON
+	struct device *hwmon_device;
+#endif
 	struct cdev cdev;
 	struct work_struct reset_work;
 	struct work_struct delete_work;
@@ -812,11 +815,16 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
 
 #ifdef CONFIG_NVME_HWMON
 int nvme_hwmon_init(struct nvme_ctrl *ctrl);
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl);
 #else
 static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 {
 	return 0;
 }
+
+static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
+{
+}
 #endif
 
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
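The shape of the rework, reduced to its essentials: allocate and
register by hand, remember the returned device, and unregister from the
owner's own teardown path instead of relying on devm cleanup tied to a
longer-lived parent. A minimal sketch for a generic driver (my_ctrl and
the "mydrv" name are assumptions, not nvme code):

	#include <linux/err.h>
	#include <linux/hwmon.h>
	#include <linux/slab.h>

	struct my_ctrl {
		struct device *dev;		/* parent for the sysfs attributes */
		struct device *hwmon_device;
	};

	static int my_hwmon_init(struct my_ctrl *ctrl,
				 const struct hwmon_chip_info *info)
	{
		long *data = kzalloc(sizeof(*data), GFP_KERNEL);
		struct device *hwmon;

		if (!data)
			return -ENOMEM;

		/* data becomes drvdata of the hwmon device so teardown can find it */
		hwmon = hwmon_device_register_with_info(ctrl->dev, "mydrv", data,
							info, NULL);
		if (IS_ERR(hwmon)) {
			kfree(data);
			return PTR_ERR(hwmon);
		}
		ctrl->hwmon_device = hwmon;
		return 0;
	}

	static void my_hwmon_exit(struct my_ctrl *ctrl)
	{
		void *data;

		if (!ctrl->hwmon_device)
			return;
		data = dev_get_drvdata(ctrl->hwmon_device);
		/* unregister first so no sysfs reader can race the kfree() */
		hwmon_device_unregister(ctrl->hwmon_device);
		ctrl->hwmon_device = NULL;
		kfree(data);
	}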
From 4bdf260362b3be529d170b04662638fd6dc52241 Mon Sep 17 00:00:00 2001
From: Filippo Sironi
Date: Wed, 10 Feb 2021 01:39:42 +0100
Subject: [PATCH 59/64] nvme: add 48-bit DMA address quirk for Amazon NVMe
 controllers

Some Amazon NVMe controllers do not follow the NVMe specification and
are limited to 48-bit DMA addresses. Add a quirk to force bounce
buffering if needed and limit the IOVA allocation for these devices.

This affects all current Amazon NVMe controllers that expose EBS
volumes (0x0061, 0x0065, 0x8061) and local instance storage (0xcd00,
0xcd01, 0xcd02).

Signed-off-by: Filippo Sironi
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/nvme.h |  6 ++++++
 drivers/nvme/host/pci.c  | 21 ++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2efb87642d18..07b34175c6ce 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -144,6 +144,12 @@ enum nvme_quirks {
 	 * NVMe 1.3 compliance.
 	 */
 	NVME_QUIRK_NO_NS_DESC_LIST		= (1 << 15),
+
+	/*
+	 * The controller does not properly handle DMA addresses over
+	 * 48 bits.
+	 */
+	NVME_QUIRK_DMA_ADDRESS_BITS_48		= (1 << 16),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5b78e68be9a1..0045c5edf629 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2362,13 +2362,16 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	int dma_address_bits = 64;
 
 	if (pci_enable_device_mem(pdev))
 		return result;
 
 	pci_set_master(pdev);
 
-	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
+	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
+		dma_address_bits = 48;
+	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
 		goto disable;
 
 	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
@@ -3257,6 +3260,22 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_DEVICE(0x15b7, 0x2001),   /* Sandisk Skyhawk */
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
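The mechanism is nothing more than picking the mask width before the
one dma_set_mask_and_coherent() call; the DMA layer then takes care of
bounce buffering and of keeping IOVA allocations under the limit. A
hedged sketch of the same pattern in a generic PCI driver
(MYDEV_QUIRK_DMA48 and mydev_setup_dma() are made-up names):

	#include <linux/dma-mapping.h>
	#include <linux/pci.h>

	#define MYDEV_QUIRK_DMA48	(1 << 0)	/* hypothetical quirk bit */

	static int mydev_setup_dma(struct pci_dev *pdev, unsigned long quirks)
	{
		/* default to the full 64-bit mask; quirked parts get 48 bits */
		int bits = (quirks & MYDEV_QUIRK_DMA48) ? 48 : 64;

		return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(bits));
	}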
From a2d52a6c1b6764031b6cac7cc156530cbb38248c Mon Sep 17 00:00:00 2001
From: Liao Pingfang
Date: Sat, 6 Feb 2021 15:10:55 +0800
Subject: [PATCH 60/64] nbd: Convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Liao Pingfang
Reviewed-by: Josef Bacik
Signed-off-by: Jens Axboe
---
 drivers/block/nbd.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6727358e147d..b076a0a53fb1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1521,17 +1521,7 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 	return 0;
 }
 
-static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_tasks_ops = {
-	.open = nbd_dbg_tasks_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
 
 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 {
@@ -1556,17 +1546,7 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 	return 0;
 }
 
-static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nbd_dbg_flags_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_flags_ops = {
-	.open = nbd_dbg_flags_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
 
 static int nbd_dev_dbg_init(struct nbd_device *nbd)
 {
@@ -1584,11 +1564,11 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
 	}
 	config->dbg_dir = dir;
 
-	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
+	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
 	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
 	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
-	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
+	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
 
 	return 0;
 }
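For readers who have not met it, DEFINE_SHOW_ATTRIBUTE(name) generates
the single_open() boilerplate from a name##_show() function and emits a
name##_fops, which is why the debugfs_create_file() calls above switch
from the hand-rolled *_ops to *_fops. Roughly, from memory (the
authoritative definition is in include/linux/seq_file.h and may differ
in detail):

	static int foo_open(struct inode *inode, struct file *file)
	{
		return single_open(file, foo_show, inode->i_private);
	}

	static const struct file_operations foo_fops = {
		.owner		= THIS_MODULE,
		.open		= foo_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};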
From 597886836164ef18b76faea7304357556fe29da9 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Wed, 10 Feb 2021 15:51:59 -0800
Subject: [PATCH 61/64] block: Replace lkml.org links with lore

As started by commit 05a5f51ca566 ("Documentation: Replace lkml.org
links with lore"), replace lkml.org links with lore to better use a
single source that's more likely to stay available long-term.

Signed-off-by: Kees Cook
Signed-off-by: Jens Axboe
---
 drivers/block/aoe/aoecmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index ac720bdcd983..ecd77897a761 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1046,7 +1046,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
 
 	__blk_mq_end_request(rq, err);
 
-	/* cf. http://lkml.org/lkml/2006/10/31/28 */
+	/* cf. https://lore.kernel.org/lkml/20061031071040.GS14055@kernel.dk/ */
 	if (!fastfail)
 		blk_mq_run_hw_queues(q, true);
 }

From e11e5116171dedeaf63735931e72ad5de0f30ed5 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 10 Feb 2021 14:04:00 -0800
Subject: [PATCH 62/64] nvme-tcp: fix crash triggered with a dataless request
 submission

A write-zeroes request has a bio, but does not have any data buffers
associated with it, so we must not initialize the request iter for it;
doing so attempts to reference bi_io_vec and crashes.

--
run blktests nvme/012 at 2021-02-05 21:53:34
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] SMP NOPTI
CPU: 15 PID: 12069 Comm: kworker/15:2H Tainted: G S I 5.11.0-rc6+ #1
Hardware name: Dell Inc. PowerEdge R640/06NR82, BIOS 2.10.0 11/12/2020
Workqueue: kblockd blk_mq_run_work_fn
RIP: 0010:nvme_tcp_init_iter+0x7d/0xd0 [nvme_tcp]
RSP: 0018:ffffbd084447bd18 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffffa0bba9f3ce80 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000002000000
RBP: ffffa0ba8ac6fec0 R08: 0000000002000000 R09: 0000000000000000
R10: 0000000002800809 R11: 0000000000000000 R12: 0000000000000000
R13: ffffa0bba9f3cf90 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffffa0c9ff9c0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000008 CR3: 00000001c9c6c005 CR4: 00000000007706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 nvme_tcp_queue_rq+0xef/0x330 [nvme_tcp]
 blk_mq_dispatch_rq_list+0x11c/0x7c0
 ? blk_mq_flush_busy_ctxs+0xf6/0x110
 __blk_mq_sched_dispatch_requests+0x12b/0x170
 blk_mq_sched_dispatch_requests+0x30/0x60
 __blk_mq_run_hw_queue+0x2b/0x60
 process_one_work+0x1cb/0x360
 ? process_one_work+0x360/0x360
 worker_thread+0x30/0x370
 ? process_one_work+0x360/0x360
 kthread+0x116/0x130
 ? kthread_park+0x80/0x80
 ret_from_fork+0x1f/0x30
--

Fixes: cb9b870fba3e ("nvme-tcp: fix wrong setting of request iov_iter")
Reported-by: Yi Zhang
Signed-off-by: Sagi Grimberg
Reviewed-by: Keith Busch
Reviewed-by: Chaitanya Kulkarni
Tested-by: Yi Zhang
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 619b0d8f6e38..69f59d2c5799 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2271,7 +2271,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	req->data_len = blk_rq_nr_phys_segments(rq) ?
 				blk_rq_payload_bytes(rq) : 0;
 	req->curr_bio = rq->bio;
-	if (req->curr_bio)
+	if (req->curr_bio && req->data_len)
 		nvme_tcp_init_iter(req, rq_data_dir(rq));
 
 	if (rq_data_dir(rq) == WRITE &&
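The crash in one line: for REQ_OP_WRITE_ZEROES the request carries a
bio describing the LBA range, but no payload segments, so there is no
bvec from which to build an iov_iter. A hedged restatement of the guard
as a predicate (nvme_tcp_req_has_payload() is an illustrative name, not
a driver function):

	#include <linux/blk-mq.h>

	/* True only when the request carries data to map: write-zeroes
	 * has rq->bio set but zero physical segments, so it fails this
	 * test. */
	static bool nvme_tcp_req_has_payload(struct request *rq)
	{
		return rq->bio && blk_rq_nr_phys_segments(rq) != 0;
	}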
From 4cf29e43afc0dea7ccf6b09a20bd598fad47bf60 Mon Sep 17 00:00:00 2001
From: Tian Tao
Date: Sun, 14 Feb 2021 10:31:02 +0000
Subject: [PATCH 63/64] lightnvm: fix unnecessary NULL check warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the NULL check before vfree() to fix this warning:

./drivers/lightnvm/pblk-gc.c:27:2-7: WARNING: NULL check before some
freeing functions is not needed.

Signed-off-by: Tian Tao
Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe
---
 drivers/lightnvm/pblk-gc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 2581eebcfc41..b31658be35a7 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -23,8 +23,7 @@
 
 static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
 {
-	if (gc_rq->data)
-		vfree(gc_rq->data);
+	vfree(gc_rq->data);
 	kfree(gc_rq);
 }

From f4b64ae6745177642cd9610cfd7df0041e7fca58 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Sun, 14 Feb 2021 10:31:03 +0000
Subject: [PATCH 64/64] lightnvm: pblk: Replace guid_copy() with
 export_guid()/import_guid()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a specific API to treat raw data as a GUID, i.e. export_guid()
and import_guid(). Use them instead of guid_copy() with explicit
casting.

Signed-off-by: Andy Shevchenko
Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe
---
 drivers/lightnvm/pblk-core.c     | 5 ++---
 drivers/lightnvm/pblk-recovery.c | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 1dddba11e721..33d39d3dd343 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -988,7 +988,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
 
 	smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
-	guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid);
+	export_guid(smeta_buf->header.uuid, &pblk->instance_uuid);
 	smeta_buf->header.id = cpu_to_le32(line->id);
 	smeta_buf->header.type = cpu_to_le16(line->type);
 	smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
@@ -1803,8 +1803,7 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
 
 	if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
 		emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
-		guid_copy((guid_t *)&emeta_buf->header.uuid,
-			  &pblk->instance_uuid);
+		export_guid(emeta_buf->header.uuid, &pblk->instance_uuid);
 		emeta_buf->header.id = cpu_to_le32(line->id);
 		emeta_buf->header.type = cpu_to_le16(line->type);
 		emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 299ef47a17b2..0e6f0c76e930 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -706,8 +706,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 
 		/* The first valid instance uuid is used for initialization */
 		if (!valid_uuid) {
-			guid_copy(&pblk->instance_uuid,
-				  (guid_t *)&smeta_buf->header.uuid);
+			import_guid(&pblk->instance_uuid, smeta_buf->header.uuid);
 			valid_uuid = 1;
 		}
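As a closing reference, the pair converts between a guid_t and a raw
byte array, which is exactly the on-disk metadata case above. A small
self-contained sketch of the round trip (the function name and values
are arbitrary):

	#include <linux/bug.h>
	#include <linux/uuid.h>

	static void guid_roundtrip_example(void)
	{
		guid_t in = GUID_INIT(0x12345678, 0x9abc, 0xdef0,
				      0x12, 0x34, 0x56, 0x78,
				      0x9a, 0xbc, 0xde, 0xf0);
		u8 raw[UUID_SIZE];	/* stands in for header.uuid */
		guid_t out;

		export_guid(raw, &in);	/* guid_t -> raw bytes, no cast */
		import_guid(&out, raw);	/* raw bytes -> guid_t */
		WARN_ON(!guid_equal(&in, &out));
	}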