From 0bca799b92807ee9be0890690f5dde7d8c6a8e25 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 5 Apr 2018 00:35:21 +0800 Subject: [PATCH 01/28] blk-mq: order getting budget and driver tag This patch orders getting budget and driver tag by making sure the driver tag is acquired only after the budget has been obtained. This helps to avoid the following race: 1) before dispatching a request from the scheduler queue, one budget is obtained first, then a request is dequeued; call it request A. 2) in another IO path, dispatching request B which comes from hctx->dispatch, the driver tag is obtained first, then blk_mq_dispatch_rq_list() tries to get budget; unfortunately the budget is held by request A. 3) meanwhile blk_mq_dispatch_rq_list() is called for dispatching request A and tries to get a driver tag first; unfortunately no driver tag is available because the driver tag is held by request B. 4) neither IO path can move on, and an IO stall is caused. This issue can be observed when running dbench on USB storage. This patch fixes this issue by always getting budget before getting driver tag. 
Cc: stable@vger.kernel.org Fixes: de1482974080ec9e ("blk-mq: introduce .get_budget and .put_budget in blk_mq_ops") Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f5c7dbcb954f..90f869a083a4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1180,7 +1180,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; + + if (!blk_mq_get_driver_tag(rq, NULL, false)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The @@ -1189,8 +1194,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(&hctx, rq)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(hctx); /* * For non-shared tags, the RESTART check * will suffice. 
@@ -1201,11 +1205,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { - blk_mq_put_driver_tag(rq); - break; - } - list_del_init(&rq->queuelist); bd.rq = rq; @@ -1804,11 +1803,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_driver_tag(rq, NULL, false)) + if (!blk_mq_get_dispatch_budget(hctx)) goto insert; - if (!blk_mq_get_dispatch_budget(hctx)) { - blk_mq_put_driver_tag(rq); + if (!blk_mq_get_driver_tag(rq, NULL, false)) { + blk_mq_put_dispatch_budget(hctx); goto insert; } From 1e047eaab3bb5564f25b41e9cd3a053009f4e789 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 6 Apr 2018 10:03:17 +0900 Subject: [PATCH 02/28] block/loop: fix deadlock after loop_set_status syzbot is reporting deadlocks at __blkdev_get() [1]. ---------------------------------------- [ 92.493919] systemd-udevd D12696 525 1 0x00000000 [ 92.495891] Call Trace: [ 92.501560] schedule+0x23/0x80 [ 92.502923] schedule_preempt_disabled+0x5/0x10 [ 92.504645] __mutex_lock+0x416/0x9e0 [ 92.510760] __blkdev_get+0x73/0x4f0 [ 92.512220] blkdev_get+0x12e/0x390 [ 92.518151] do_dentry_open+0x1c3/0x2f0 [ 92.519815] path_openat+0x5d9/0xdc0 [ 92.521437] do_filp_open+0x7d/0xf0 [ 92.527365] do_sys_open+0x1b8/0x250 [ 92.528831] do_syscall_64+0x6e/0x270 [ 92.530341] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 92.931922] 1 lock held by systemd-udevd/525: [ 92.933642] #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x73/0x4f0 ---------------------------------------- The reason of deadlock turned out that wait_event_interruptible() in blk_queue_enter() got stuck with bdev->bd_mutex held at __blkdev_put() due to q->mq_freeze_depth == 1. 
---------------------------------------- [ 92.787172] a.out S12584 634 633 0x80000002 [ 92.789120] Call Trace: [ 92.796693] schedule+0x23/0x80 [ 92.797994] blk_queue_enter+0x3cb/0x540 [ 92.803272] generic_make_request+0xf0/0x3d0 [ 92.807970] submit_bio+0x67/0x130 [ 92.810928] submit_bh_wbc+0x15e/0x190 [ 92.812461] __block_write_full_page+0x218/0x460 [ 92.815792] __writepage+0x11/0x50 [ 92.817209] write_cache_pages+0x1ae/0x3d0 [ 92.825585] generic_writepages+0x5a/0x90 [ 92.831865] do_writepages+0x43/0xd0 [ 92.836972] __filemap_fdatawrite_range+0xc1/0x100 [ 92.838788] filemap_write_and_wait+0x24/0x70 [ 92.840491] __blkdev_put+0x69/0x1e0 [ 92.841949] blkdev_close+0x16/0x20 [ 92.843418] __fput+0xda/0x1f0 [ 92.844740] task_work_run+0x87/0xb0 [ 92.846215] do_exit+0x2f5/0xba0 [ 92.850528] do_group_exit+0x34/0xb0 [ 92.852018] SyS_exit_group+0xb/0x10 [ 92.853449] do_syscall_64+0x6e/0x270 [ 92.854944] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 92.943530] 1 lock held by a.out/634: [ 92.945105] #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_put+0x3c/0x1e0 ---------------------------------------- The reason of q->mq_freeze_depth == 1 turned out that loop_set_status() forgot to call blk_mq_unfreeze_queue() at error paths for info->lo_encrypt_type != NULL case. ---------------------------------------- [ 37.509497] CPU: 2 PID: 634 Comm: a.out Tainted: G W 4.16.0+ #457 [ 37.513608] Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017 [ 37.518832] RIP: 0010:blk_freeze_queue_start+0x17/0x40 [ 37.521778] RSP: 0018:ffffb0c2013e7c60 EFLAGS: 00010246 [ 37.524078] RAX: 0000000000000000 RBX: ffff8b07b1519798 RCX: 0000000000000000 [ 37.527015] RDX: 0000000000000002 RSI: ffffb0c2013e7cc0 RDI: ffff8b07b1519798 [ 37.529934] RBP: ffffb0c2013e7cc0 R08: 0000000000000008 R09: 47a189966239b898 [ 37.532684] R10: dad78b99b278552f R11: 9332dca72259d5ef R12: ffff8b07acd73678 [ 37.535452] R13: 0000000000004c04 R14: 0000000000000000 R15: ffff8b07b841e940 [ 37.538186] FS: 00007fede33b9740(0000) GS:ffff8b07b8e80000(0000) knlGS:0000000000000000 [ 37.541168] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.543590] CR2: 00000000206fdf18 CR3: 0000000130b30006 CR4: 00000000000606e0 [ 37.546410] Call Trace: [ 37.547902] blk_freeze_queue+0x9/0x30 [ 37.549968] loop_set_status+0x67/0x3c0 [loop] [ 37.549975] loop_set_status64+0x3b/0x70 [loop] [ 37.549986] lo_ioctl+0x223/0x810 [loop] [ 37.549995] blkdev_ioctl+0x572/0x980 [ 37.550003] block_ioctl+0x34/0x40 [ 37.550006] do_vfs_ioctl+0xa7/0x6d0 [ 37.550017] ksys_ioctl+0x6b/0x80 [ 37.573076] SyS_ioctl+0x5/0x10 [ 37.574831] do_syscall_64+0x6e/0x270 [ 37.576769] entry_SYSCALL_64_after_hwframe+0x42/0xb7 ---------------------------------------- [1] https://syzkaller.appspot.com/bug?id=cd662bc3f6022c0979d01a262c318fab2ee9b56f Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: ecdd09597a572513 ("block/loop: fix race between I/O and set_status") Cc: Ming Lei Cc: Dmitry Vyukov Cc: stable Cc: Jens Axboe Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 264abaaff662..e5fc020cceda 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1103,11 +1103,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (info->lo_encrypt_type) { unsigned int type 
= info->lo_encrypt_type; - if (type >= MAX_LO_CRYPT) - return -EINVAL; + if (type >= MAX_LO_CRYPT) { + err = -EINVAL; + goto exit; + } xfer = xfer_funcs[type]; - if (xfer == NULL) - return -EINVAL; + if (xfer == NULL) { + err = -EINVAL; + goto exit; + } } else xfer = NULL; From bdac616db9bbadb90b7d6a406144571015e138f7 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 6 Apr 2018 09:57:03 -0700 Subject: [PATCH 03/28] loop: fix LOOP_GET_STATUS lock imbalance Commit 2d1d4c1e591f made loop_get_status() drop lo_ctx_mutex before returning, but the loop_get_status_old(), loop_get_status64(), and loop_get_status_compat() wrappers don't call loop_get_status() if the passed argument is NULL. The callers expect that the lock is dropped, so make sure we drop it in that case, too. Reported-by: syzbot+31e8daa8b3fc129e75f2@syzkaller.appspotmail.com Fixes: 2d1d4c1e591f ("loop: don't call into filesystem while holding lo_ctl_mutex") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e5fc020cceda..c9d04497a415 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1287,12 +1287,13 @@ static int loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; - int err = 0; + int err; - if (!arg) - err = -EINVAL; - if (!err) - err = loop_get_status(lo, &info64); + if (!arg) { + mutex_unlock(&lo->lo_ctl_mutex); + return -EINVAL; + } + err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_old(&info64, &info); if (!err && copy_to_user(arg, &info, sizeof(info))) @@ -1304,12 +1305,13 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { static int loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { struct loop_info64 info64; - int err = 0; + int err; - if (!arg) - err = 
-EINVAL; - if (!err) - err = loop_get_status(lo, &info64); + if (!arg) { + mutex_unlock(&lo->lo_ctl_mutex); + return -EINVAL; + } + err = loop_get_status(lo, &info64); if (!err && copy_to_user(arg, &info64, sizeof(info64))) err = -EFAULT; @@ -1533,12 +1535,13 @@ loop_get_status_compat(struct loop_device *lo, struct compat_loop_info __user *arg) { struct loop_info64 info64; - int err = 0; + int err; - if (!arg) - err = -EINVAL; - if (!err) - err = loop_get_status(lo, &info64); + if (!arg) { + mutex_unlock(&lo->lo_ctl_mutex); + return -EINVAL; + } + err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); return err; From a1c735fb790745f94a359df45c11df4a69760389 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:07 +0800 Subject: [PATCH 04/28] blk-mq: make sure that correct hctx->next_cpu is set From commit 20e4d81393196 (blk-mq: simplify queue mapping & schedule with each possisble CPU), one hctx can be mapped from all offline CPUs, then hctx->next_cpu can be set as wrong. This patch fixes this issue by making hctx->next_cpu pointing to the first CPU in hctx->cpumask if all CPUs in hctx->cpumask are offline. 
Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Fixes: 20e4d81393196 ("blk-mq: simplify queue mapping & schedule with each possisble CPU") Cc: stable@vger.kernel.org Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 90f869a083a4..f489ec920807 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2431,6 +2431,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ hctx->next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); + if (hctx->next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } From bffa9909a6b48d8ca3398dec601bc9162a4020c4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:08 +0800 Subject: [PATCH 05/28] blk-mq: don't keep offline CPUs mapped to hctx 0 From commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU), blk-mq doesn't remap queue after CPU topo is changed, that said when some of these offline CPUs become online, they are still mapped to hctx 0, then hctx 0 may become the bottleneck of IO dispatch and completion. This patch sets up the mapping from the beginning, and aligns to queue mapping for PCI device (blk_mq_pci_map_queues()). Cc: Stefan Haberland Cc: Keith Busch Cc: stable@vger.kernel.org Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU) Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9f8cffc8a701..3eb169f15842 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -16,11 +16,6 @@ static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) { - /* - * Non present CPU will be mapped to queue index 0. 
- */ - if (!cpu_present(cpu)) - return 0; return cpu % nr_queues; } From 476f8c98a9bccccbb97866974ffc80879adf2bbb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:09 +0800 Subject: [PATCH 06/28] blk-mq: avoid to write intermediate result to hctx->next_cpu This patch figures out the final selected CPU, then writes it to hctx->next_cpu once, then we can avoid to intermediate next cpu observed from other dispatch paths. Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f489ec920807..db178c577068 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1344,26 +1344,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; + int next_cpu = hctx->next_cpu; if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int next_cpu; select_cpu: - next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, + next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); + next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); /* * No online CPU is found, so have to make sure hctx->next_cpu * is set correctly for not breaking workqueue. */ if (next_cpu >= nr_cpu_ids) - hctx->next_cpu = cpumask_first(hctx->cpumask); - else - hctx->next_cpu = next_cpu; + next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } @@ -1371,7 +1369,7 @@ select_cpu: * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. 
*/ - if (!cpu_online(hctx->next_cpu)) { + if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; @@ -1381,10 +1379,13 @@ select_cpu: * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } - return hctx->next_cpu; + + hctx->next_cpu = next_cpu; + return next_cpu; } static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, From f82ddf1923b90f89665d08cf219287c8f9deb739 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:10 +0800 Subject: [PATCH 07/28] blk-mq: introduce blk_mq_hw_queue_first_cpu() to figure out first cpu This patch introduces the helper blk_mq_first_mapped_cpu() for figuring out the hctx's first cpu, so that code duplication can be avoided. Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index db178c577068..e05bd10d5c84 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1335,6 +1335,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) hctx_unlock(hctx, srcu_idx); } +static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); + + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(hctx->cpumask); + return cpu; +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -1354,14 +1363,7 @@ select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); - - /* - * No online CPU is found, so have to make sure hctx->next_cpu - * is set correctly for not breaking workqueue. 
- */ - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(hctx->cpumask); + next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } @@ -2430,10 +2432,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first_and(hctx->cpumask, - cpu_online_mask); - if (hctx->next_cpu >= nr_cpu_ids) - hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } From 15fe8a90bb45b953ca36f074194fcb519a05fdec Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:11 +0800 Subject: [PATCH 08/28] blk-mq: remove blk_mq_delay_queue() No driver uses this interface any more, so remove it. Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 30 ++---------------------------- include/linux/blk-mq.h | 2 -- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 58b3b79cbe83..3080e18cb859 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -235,7 +235,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index e05bd10d5c84..c2c6d276da3a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1562,40 +1562,14 @@ static void blk_mq_run_work_fn(struct work_struct *work) hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); /* - * If we are stopped, don't run the queue. The exception is if - * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear - * the STOPPED bit and run it. + * If we are stopped, don't run the queue. 
*/ - if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) { - if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state)) - return; - - clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); + if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - } __blk_mq_run_hw_queue(hctx); } - -void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) -{ - if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) - return; - - /* - * Stop the hw queue, then modify currently delayed work. - * This should prevent us from running the queue prematurely. - * Mark the queue as auto-clearing STOPPED when it runs. - */ - blk_mq_stop_hw_queue(hctx); - set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); - kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); -} -EXPORT_SYMBOL(blk_mq_delay_queue); - static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8efcf49796a3..e3986f4b3461 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -183,7 +183,6 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, @@ -270,7 +269,6 @@ void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); -void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); From efea8450c3d2d3918029b36f59ef612be57d91ae Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:12 +0800 Subject: [PATCH 09/28] blk-mq: don't check queue mapped in __blk_mq_delay_run_hw_queue() There 
are several reasons for removing the check: 1) blk_mq_hw_queue_mapped() returns true always now since each hctx may be mapped by one CPU at least 2) when there isn't any online CPU mapped to this hctx, there won't be any IO queued to this CPU, blk_mq_run_hw_queue() only runs queue if there is IO queued to this hctx 3) If __blk_mq_delay_run_hw_queue() is called by blk_mq_delay_run_hw_queue(), which is run from blk_mq_dispatch_rq_list() or scsi_mq_get_budget(), and the hctx to be handled has to be mapped. Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c2c6d276da3a..0ee9d8e964b3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1393,9 +1393,6 @@ select_cpu: static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { - if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) - return; - if (unlikely(blk_mq_hctx_stopped(hctx))) return; From 127276c6ce5a30fcc806b7fe53015f4f89b62956 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:13 +0800 Subject: [PATCH 10/28] blk-mq: reimplement blk_mq_hw_queue_mapped Now the actual meaning of queue mapped is that if there is any online CPU mapped to this hctx, so implement blk_mq_hw_queue_mapped() in this way. 
Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index 88c558f71819..502af371b83b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -181,7 +181,7 @@ static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { - return hctx->nr_ctx && hctx->tags; + return cpumask_first_and(hctx->cpumask, cpu_online_mask) < nr_cpu_ids; } void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, From 37c7c6c76d431dd7ef9c29d95f6052bd425f004c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Apr 2018 17:48:14 +0800 Subject: [PATCH 11/28] blk-mq: remove code for dealing with remapping queue Firstly, since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"), blk-mq doesn't remap queues any more after the CPU topology is changed. Secondly, set->nr_hw_queues can't be bigger than nr_cpu_ids, and now we map all possible CPUs to hw queues, so at least one CPU is mapped to each hctx. So queue mapping has become static and fixed just like a percpu variable, and we don't need to handle queue remapping any more. 
Cc: Stefan Haberland Tested-by: Christian Borntraeger Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0ee9d8e964b3..0dc9e341c2a7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2329,7 +2329,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, hctx_idx; + unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2346,23 +2346,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Map software to hardware queues. - * - * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = q->mq_map[i]; - /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx] && - !__blk_mq_alloc_rq_map(set, hctx_idx)) { - /* - * If tags initialization fail for some hctx, - * that hctx won't be brought online. In this - * case, remap the current ctx to hctx[0] which - * is guaranteed to always have tags allocated - */ - q->mq_map[i] = 0; - } - ctx = per_cpu_ptr(q->queue_ctx, i); hctx = blk_mq_map_queue(q, i); @@ -2374,21 +2359,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - /* - * If no software queues are mapped to this hardware queue, - * disable it and free the request entries. - */ - if (!hctx->nr_ctx) { - /* Never unmap queue 0. 
We need it as a - * fallback in case of a new remap fails - * allocation - */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); - - hctx->tags = NULL; - continue; - } + /* every hctx should get mapped by at least one CPU */ + WARN_ON(!hctx->nr_ctx); hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); From a93f00b3762026dd8231f473fae9346bda07db03 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Fri, 6 Apr 2018 22:14:51 +0200 Subject: [PATCH 12/28] backing: silence compiler warning using __printf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __printf marker was added in commit d2cc4dde9206 ("bdi_register: add __printf verification, fix arg mismatch") for function `bdi_register` since it is useful to verify format and arguments. Apply equivalent gcc attribute to `bdi_register_va`. Remove warning triggered with W=1: mm/backing-dev.c:881:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] Reviewed-by: Jan Kara Signed-off-by: Mathieu Malaterre Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e4ce54d84ab..0e9c0f71f726 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -28,6 +28,7 @@ void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +__printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); From 37f9579f4c31a6d698dbf3016d7bf132f9288d30 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 10 Apr 2018 17:02:40 -0600 Subject: [PATCH 13/28] blk-mq: Avoid that submitting a bio concurrently with device removal triggers a crash Because blkcg_exit_queue() is now called from inside blk_cleanup_queue() it is no longer 
safe to access cgroup information during or after the blk_cleanup_queue() call. Hence protect the generic_make_request_checks() call with blk_queue_enter() / blk_queue_exit(). Reported-by: Ming Lei Fixes: a063057d7c73 ("block: Fix a race between request queue removal and the block cgroup controller") Signed-off-by: Bart Van Assche Cc: Ming Lei Cc: Joseph Qi Signed-off-by: Jens Axboe --- block/blk-core.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index abcb8684ba67..806ce2442819 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2385,8 +2385,20 @@ blk_qc_t generic_make_request(struct bio *bio) * yet. */ struct bio_list bio_list_on_stack[2]; + blk_mq_req_flags_t flags = 0; + struct request_queue *q = bio->bi_disk->queue; blk_qc_t ret = BLK_QC_T_NONE; + if (bio->bi_opf & REQ_NOWAIT) + flags = BLK_MQ_REQ_NOWAIT; + if (blk_queue_enter(q, flags) < 0) { + if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT)) + bio_wouldblock_error(bio); + else + bio_io_error(bio); + return ret; + } + if (!generic_make_request_checks(bio)) goto out; @@ -2423,11 +2435,22 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_disk->queue; - blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
- BLK_MQ_REQ_NOWAIT : 0; + bool enter_succeeded = true; - if (likely(blk_queue_enter(q, flags) == 0)) { + if (unlikely(q != bio->bi_disk->queue)) { + if (q) + blk_queue_exit(q); + q = bio->bi_disk->queue; + flags = 0; + if (bio->bi_opf & REQ_NOWAIT) + flags = BLK_MQ_REQ_NOWAIT; + if (blk_queue_enter(q, flags) < 0) { + enter_succeeded = false; + q = NULL; + } + } + + if (enter_succeeded) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2435,8 +2458,6 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); - blk_queue_exit(q); - /* sort new bios into those for a lower level * and those for the same level */ @@ -2463,6 +2484,8 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = NULL; /* deactivate */ out: + if (q) + blk_queue_exit(q); return ret; } EXPORT_SYMBOL(generic_make_request); From 2434af79c85d45d41d0c286fedf6e0556888a54c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 11 Apr 2018 18:47:44 +0800 Subject: [PATCH 14/28] blk-mq: Revert "blk-mq: reimplement blk_mq_hw_queue_mapped" This reverts commit 127276c6ce5a30fcc806b7fe53015f4f89b62956. When all CPUs of one hw queue become offline, there may still be IOs from this hctx that have not completed. But blk_mq_hw_queue_mapped() is called in blk_mq_queue_tag_busy_iter(), which is used for iterating requests in the timeout handler, so the timeout event will be missed on the inactive hctx, and then the request may never be completed. Also the reimplementation of blk_mq_hw_queue_mapped() doesn't match the helper's name any more, and it should have been named blk_mq_hw_queue_active(). Other callers also need further verification against this reimplementation. So revert this patch now, and we can improve hw queue activate/inactivate events after adequate research and testing. 
Cc: Stefan Haberland Cc: Christian Borntraeger Cc: Christoph Hellwig Reported-by: Jens Axboe Fixes: 127276c6ce5a30fcc ("blk-mq: reimplement blk_mq_hw_queue_mapped") Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index 502af371b83b..88c558f71819 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -181,7 +181,7 @@ static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { - return cpumask_first_and(hctx->cpumask, cpu_online_mask) < nr_cpu_ids; + return hctx->nr_ctx && hctx->tags; } void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, From 2d097c50212e137e7b53ffe3b37561153eeba87d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Apr 2018 11:26:09 -0600 Subject: [PATCH 15/28] sr: get/drop reference to device in revalidate and check_events We can't just use scsi_cd() to get the scsi_cd structure, we have to grab a live reference to the device. For both callbacks, we're not inside an open where we already hold a reference to the device. This fixes device removal/addition under concurrent device access, which otherwise could result in the below oops. NULL pointer dereference at 0000000000000010 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: sr 12:0:0:0: [sr2] scsi-1 drive scsi_debug crc_t10dif crct10dif_generic crct10dif_common nvme nvme_core sb_edac xl sr 12:0:0:0: Attached scsi CD-ROM sr2 sr_mod cdrom btrfs xor zstd_decompress zstd_compress xxhash lzo_compress zlib_defc sr 12:0:0:0: Attached scsi generic sg7 type 5 igb ahci libahci i2c_algo_bit libata dca [last unloaded: crc_t10dif] CPU: 43 PID: 4629 Comm: systemd-udevd Not tainted 4.16.0+ #650 Hardware name: Dell Inc. 
PowerEdge T630/0NT78X, BIOS 2.3.4 11/09/2016 RIP: 0010:sr_block_revalidate_disk+0x23/0x190 [sr_mod] RSP: 0018:ffff883ff357bb58 EFLAGS: 00010292 RAX: ffffffffa00b07d0 RBX: ffff883ff3058000 RCX: ffff883ff357bb66 RDX: 0000000000000003 RSI: 0000000000007530 RDI: ffff881fea631000 RBP: 0000000000000000 R08: ffff881fe4d38400 R09: 0000000000000000 R10: 0000000000000000 R11: 00000000000001b6 R12: 000000000800005d R13: 000000000800005d R14: ffff883ffd9b3790 R15: 0000000000000000 FS: 00007f7dc8e6d8c0(0000) GS:ffff883fff340000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000003ffda98005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __invalidate_device+0x48/0x60 check_disk_change+0x4c/0x60 sr_block_open+0x16/0xd0 [sr_mod] __blkdev_get+0xb9/0x450 ? iget5_locked+0x1c0/0x1e0 blkdev_get+0x11e/0x320 ? bdget+0x11d/0x150 ? _raw_spin_unlock+0xa/0x20 ? bd_acquire+0xc0/0xc0 do_dentry_open+0x1b0/0x320 ? inode_permission+0x24/0xc0 path_openat+0x4e6/0x1420 ? cpumask_any_but+0x1f/0x40 ? flush_tlb_mm_range+0xa0/0x120 do_filp_open+0x8c/0xf0 ? __seccomp_filter+0x28/0x230 ? _raw_spin_unlock+0xa/0x20 ? __handle_mm_fault+0x7d6/0x9b0 ? list_lru_add+0xa8/0xc0 ? _raw_spin_unlock+0xa/0x20 ? __alloc_fd+0xaf/0x160 ? 
do_sys_open+0x1a6/0x230 do_sys_open+0x1a6/0x230 do_syscall_64+0x5a/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Reviewed-by: Lee Duncan Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/scsi/sr.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 0cf25d789d05..3f3cb72e0c0c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -587,18 +587,28 @@ out: static unsigned int sr_block_check_events(struct gendisk *disk, unsigned int clearing) { - struct scsi_cd *cd = scsi_cd(disk); + unsigned int ret = 0; + struct scsi_cd *cd; - if (atomic_read(&cd->device->disk_events_disable_depth)) + cd = scsi_cd_get(disk); + if (!cd) return 0; - return cdrom_check_events(&cd->cdi, clearing); + if (!atomic_read(&cd->device->disk_events_disable_depth)) + ret = cdrom_check_events(&cd->cdi, clearing); + + scsi_cd_put(cd); + return ret; } static int sr_block_revalidate_disk(struct gendisk *disk) { - struct scsi_cd *cd = scsi_cd(disk); struct scsi_sense_hdr sshdr; + struct scsi_cd *cd; + + cd = scsi_cd_get(disk); + if (!cd) + return -ENXIO; /* if the unit is not ready, nothing more to do */ if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) @@ -607,6 +617,7 @@ static int sr_block_revalidate_disk(struct gendisk *disk) sr_cd_check(&cd->cdi); get_sectorsize(cd); out: + scsi_cd_put(cd); return 0; } From 7ec6074ff005e5f6cd2cf186a9ec7496c3db04f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 12 Apr 2018 09:16:03 -0600 Subject: [PATCH 16/28] nvme: enforce 64bit offset for nvme_get_log_ext fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling on 32 bits system produces a warning for the shift width when shifting 32 bit integer with 64bit integer. Make sure that offset always is 64bit, and use macros for retrieving lower and upper bits of the offset. 
Signed-off-by: Matias Bjørling Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 197a6ba9700f..1bdd010a0cf5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2220,7 +2220,7 @@ out_unlock: int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 log_page, void *log, - size_t size, size_t offset) + size_t size, u64 offset) { struct nvme_command c = { }; unsigned long dwlen = size / 4 - 1; @@ -2235,8 +2235,8 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.get_log_page.lid = log_page; c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); - c.get_log_page.lpol = cpu_to_le32(offset & ((1ULL << 32) - 1)); - c.get_log_page.lpou = cpu_to_le32(offset >> 32ULL); + c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); + c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cf93690b3ffc..09d47b1937ff 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -430,7 +430,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, size_t size, size_t offset); + u8 log_page, void *log, size_t size, u64 offset); extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; From 11d9ea6f2ca69237d35d6c55755beba3e006b106 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 12 Apr 2018 09:16:04 -0600 Subject: [PATCH 17/28] nvme-loop: fix kernel oops in case of unhandled command When nvmet_req_init() fails, __nvmet_req_complete() is called to handle 
the target request via .queue_response(), so nvme_loop_queue_response() shouldn't be called again for handling the failure. This patch fixes this case by the following way: - move blk_mq_start_request() before nvmet_req_init(), so nvme_loop_queue_response() may work well to complete this host request - don't call nvme_cleanup_cmd() which is done in nvme_loop_complete_rq() - don't call nvme_loop_queue_response() which is done via .queue_response() Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig [trimmed changelog] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/loop.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a350765d2d5c..b9d5b69d8548 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -174,15 +174,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; + blk_mq_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; iod->req.port = nvmet_loop_port; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, - &queue->nvme_sq, &nvme_loop_ops)) { - nvme_cleanup_cmd(req); - blk_mq_start_request(req); - nvme_loop_queue_response(&iod->req); + &queue->nvme_sq, &nvme_loop_ops)) return BLK_STS_OK; - } if (blk_rq_payload_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; @@ -196,8 +193,6 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, iod->req.transfer_len = blk_rq_payload_bytes(req); } - blk_mq_start_request(req); - schedule_work(&iod->work); return BLK_STS_OK; } From 00b683dbabc34599b96a935aeee791f9af3ae02e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 12 Apr 2018 09:16:05 -0600 Subject: [PATCH 18/28] nvme: unexport nvme_start_keep_alive nvme_start_keep_alive() isn't used outside core.c so unexport it and make it static. 
Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 +-- drivers/nvme/host/nvme.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1bdd010a0cf5..695e52ae22fd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -826,7 +826,7 @@ static void nvme_keep_alive_work(struct work_struct *work) } } -void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) { if (unlikely(ctrl->kato == 0)) return; @@ -836,7 +836,6 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl) ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } -EXPORT_SYMBOL_GPL(nvme_start_keep_alive); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 09d47b1937ff..08c4cff79cde 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -422,7 +422,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, unsigned timeout, int qid, int at_head, blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); -void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); From 74c6c71530847808d4e3be7b205719270efee80c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 12 Apr 2018 09:16:06 -0600 Subject: [PATCH 19/28] nvme: don't send keep-alives to the discovery controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVMe over Fabrics 1.0 Section 5.2 "Discovery Controller Properties and Command Support" Figure 31 "Discovery Controller – Admin Commands" explicitly lists all commands but "Get Log Page"
and "Identify" as reserved, but NetApp reports the Linux host is sending Keep Alive commands to the discovery controller, which is a violation of the Spec. We're already checking for discovery controllers when configuring the keep alive timeout, but when creating a discovery controller we're not hard wiring the keep alive timeout to 0, and thus remain on NVME_DEFAULT_KATO for the discovery controller. This can be easily reproduced when issuing a direct connect to the discovery subsystem using: 'nvme connect [...] --nqn=nqn.2014-08.org.nvmexpress.discovery' Signed-off-by: Johannes Thumshirn Fixes: 07bfcd09a288 ("nvme-fabrics: add a generic NVMe over Fabrics library") Reported-by: Martin George Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8f0f34d06d46..3583f9492a45 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -608,8 +608,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->discovery_nqn = !(strcmp(opts->subsysnqn, NVME_DISC_SUBSYS_NAME)); - if (opts->discovery_nqn) + if (opts->discovery_nqn) { + opts->kato = 0; opts->nr_io_queues = 0; + } break; case NVMF_OPT_TRADDR: p = match_strdup(args); From 6038aa532a224da68c478f34f4dbce33c47169e6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Apr 2018 09:16:07 -0600 Subject: [PATCH 20/28] nvme: target: fix buffer overflow nvmet_execute_get_disc_log_page() passes a fixed-length string into nvmet_format_discovery_entry(), which then does a longer memcpy() on it, as pointed out by gcc-8: In function 'nvmet_format_discovery_entry', inlined from 'nvmet_execute_get_disc_log_page' at drivers/nvme/target/discovery.c:126:4: drivers/nvme/target/discovery.c:62:2: error: 'memcpy' forming offset [38, 223] is out of the bounds [0, 37] [-Werror=array-bounds] memcpy(e->subnqn,
subsys_nqn, NVMF_NQN_SIZE); Using strncpy() will make this well-defined, filling the rest of the buffer with zeroes, under the assumption that the input is either a NUL-terminated string, or a byte sequence containing no zeroes. If the input is a string that is longer than NVMF_NQN_SIZE, we continue to have no NUL-termination in the output. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/discovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index a72425d8bce0..231e04e0a496 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -59,7 +59,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE); memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); - memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); + strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); } /* From 64ee0ac0527704c47170316fa58dbde50edaaf70 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Apr 2018 09:16:08 -0600 Subject: [PATCH 21/28] nvme-pci: Skip queue deletion if there are no queues User reported controller always retains CSTS.RDY to 1, which fails controller disabling when resetting the controller. This is also before the admin queue is allocated, and trying to disable an unallocated queue results in a NULL dereference. 
Reported-by: Alex Gagniuc Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 295fbec1e5f2..22403aa7dc60 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2201,7 +2201,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); - if (!dead) { + if (!dead && dev->ctrl.queue_count > 0) { /* * If the controller is still alive tell it to stop using the * host memory buffer. In theory the shutdown / reset should From a6ff7262c26c190f2480721703211cb12d66d45a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Apr 2018 09:16:09 -0600 Subject: [PATCH 22/28] nvme-pci: Remove unused queue parameter All the queue memory is allocated up front. We don't take the node into consideration when creating queues anymore, so removing the unused parameter. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 22403aa7dc60..0b3b4d9fd423 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1380,8 +1380,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } -static int nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int node) +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) { struct nvme_queue *nvmeq = &dev->queues[qid]; @@ -1596,8 +1595,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, - dev_to_node(dev->dev)); + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) return result; @@ -1630,9 +1628,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) int ret = 
0; for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { - /* vector == qid - 1, match nvme_create_queue */ - if (nvme_alloc_queue(dev, i, dev->q_depth, - pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { + if (nvme_alloc_queue(dev, i, dev->q_depth)) { ret = -ENOMEM; break; } From 22b5560195bd66bc43359b71821dc78cc9de56c6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Apr 2018 09:16:10 -0600 Subject: [PATCH 23/28] nvme-pci: Separate IO and admin queue IRQ vectors The admin and first IO queues shared the first irq vector, which has an affinity mask including cpu0. If a system allows cpu0 to be offlined, the admin queue may not be usable if no other CPUs in the affinity mask are online. This is a problem since unlike IO queues, there is only one admin queue that always needs to be usable. To fix, this patch allocates one pre_vector for the admin queue that is assigned all CPUs, so will always be accessible. The IO queues are assigned the remaining managed vectors. In case a controller has only one interrupt vector available, the admin and IO queues will share the pre_vector with all CPUs assigned. Cc: Jianchao Wang Cc: Ming Lei Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0b3b4d9fd423..fbc71fac6f1e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -84,6 +84,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; + unsigned int num_vecs; int q_depth; u32 db_stride; void __iomem *bar; @@ -414,7 +415,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0); + return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), + dev->num_vecs > 1 ? 
1 /* admin queue */ : 0); } /** @@ -1456,7 +1458,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) nvmeq->sq_cmds_io = dev->cmb + offset; } - nvmeq->cq_vector = qid - 1; + /* + * A queue's vector matches the queue identifier unless the controller + * has only one vector available. + */ + nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) goto release_vector; @@ -1910,6 +1916,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, nr_io_queues; unsigned long size; + struct irq_affinity affd = { + .pre_vectors = 1 + }; + nr_io_queues = num_possible_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) @@ -1945,11 +1955,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * setting up the full range we need. */ pci_free_irq_vectors(pdev); - nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY); - if (nr_io_queues <= 0) + result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + if (result <= 0) return -EIO; - dev->max_qid = nr_io_queues; + dev->num_vecs = result; + dev->max_qid = max(result - 1, 1); /* * Should investigate if there's a performance win from allocating From 543c09c89fdc007c2990aa9d2abcc62e0dfa1311 Mon Sep 17 00:00:00 2001 From: "Rodrigo R. Galvao" Date: Thu, 12 Apr 2018 09:16:11 -0600 Subject: [PATCH 24/28] nvmet: Fix nvmet_execute_write_zeroes sector count We have to increment the number of logical blocks to a 1's based value in the native format prior to converting to 512b units. Signed-off-by: Rodrigo R. 
Galvao [changelog] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 28bbdff4a88b..cd2344179673 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -173,8 +173,8 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req) sector = le64_to_cpu(write_zeroes->slba) << (req->ns->blksize_shift - 9); - nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length)) << - (req->ns->blksize_shift - 9)) + 1; + nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + (req->ns->blksize_shift - 9)); if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, GFP_KERNEL, &bio, 0)) From fd92c77f58257ae5eb5180afe36e86094e4910f6 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 12 Apr 2018 09:16:12 -0600 Subject: [PATCH 25/28] nvme: check return value of init_srcu_struct function Also add error flow in case srcu initialization function fails. 
Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 695e52ae22fd..b09940c556d0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2832,7 +2832,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_free_head; head->instance = ret; INIT_LIST_HEAD(&head->list); - init_srcu_struct(&head->srcu); + ret = init_srcu_struct(&head->srcu); + if (ret) + goto out_ida_remove; head->subsys = ctrl->subsys; head->ns_id = nsid; kref_init(&head->ref); @@ -2854,6 +2856,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, return head; out_cleanup_srcu: cleanup_srcu_struct(&head->srcu); +out_ida_remove: ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); out_free_head: kfree(head); From c73996984902516745bc587d5e8a0b2e034aea05 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Thu, 12 Apr 2018 09:16:13 -0600 Subject: [PATCH 26/28] nvmet: fix space padding in serial number Commit 42de82a8b544 previously attempted to fix this, and it did correctly pad the MN and FR fields with spaces, but the SN field still contains 0 bytes. The current code fills out the first 16 bytes with hex2bin, leaving the last 4 bytes zeroed. Rather than adding a lot of error-prone math to avoid overwriting SN twice, just set the whole thing to spaces up front (it's only 20 bytes). 
Fixes: 42de82a8b544 ("nvmet: don't report 0-bytes in serial number") Signed-off-by: Daniel Verkamp Reviewed-by: Martin Wilck Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 90dcdc40ac71..5e0e9fcc0d4d 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -178,6 +178,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->vid = 0; id->ssvid = 0; + memset(id->sn, ' ', sizeof(id->sn)); bin2hex(id->sn, &ctrl->subsys->serial, min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); From 62843c2e4226057c83f520c74fe9c81a1891c331 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Apr 2018 09:16:14 -0600 Subject: [PATCH 27/28] nvme: Use admin command effects for admin commands Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b09940c556d0..aac3c1d2b2a2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1102,7 +1102,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + effects = le32_to_cpu(ctrl->effects->acs[opcode]); else effects = nvme_known_admin_effects(opcode); From bb06ec31452fb2da1594f88035c2ecea4e0652f4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 12 Apr 2018 09:16:15 -0600 Subject: [PATCH 28/28] nvme: expand nvmf_check_if_ready checks The nvmf_check_if_ready() checks that were added are very simplistic. As such, the routine allows a lot of cases to fail ios during windows of reset or re-connection. 
In cases where there are no multi-path options present, the error goes back to the caller - the filesystem or application. Not good. The common routine was rewritten and calling syntax slightly expanded so that per-transport is_ready routines don't need to be present. The transports now call the routine directly. The routine is now a fabrics routine rather than an inline function. The routine now looks at controller state to decide the action to take. Some states mandate io failure. Others define the condition where a command can be accepted. When the decision is unclear, a generic queue-or-reject check is made to look for failfast or multipath ios and only fails the io if it is so marked. Otherwise, the io will be queued and wait for the controller state to resolve. Admin commands issued via ioctl share a live admin queue with commands from the transport for controller init. The ioctls could be intermixed with the initialization commands. It's possible for the ioctl cmd to be issued prior to the controller being enabled. To block this, the ioctl admin commands need to be distinguished from admin commands used for controller init. Added a USERCMD nvme_req(req)->flags bit to reflect this division and set it on ioctl requests. As the nvmf_check_if_ready() routine is called prior to nvme_setup_cmd(), ensure that commands allocated by the ioctl path (actually anything in core.c) prep the nvme_req(req) before starting the io. This will preserve the USERCMD flag during execution and/or retry.
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 17 +++++--- drivers/nvme/host/fabrics.c | 79 +++++++++++++++++++++++++++++++++++++ drivers/nvme/host/fabrics.h | 33 +--------------- drivers/nvme/host/fc.c | 12 ++---- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/rdma.c | 14 +------ drivers/nvme/target/loop.c | 11 +----- 7 files changed, 101 insertions(+), 66 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index aac3c1d2b2a2..9df4f71e58ca 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -376,6 +376,15 @@ static void nvme_put_ns(struct nvme_ns *ns) kref_put(&ns->kref, nvme_free_ns); } +static inline void nvme_clear_nvme_request(struct request *req) +{ + if (!(req->rq_flags & RQF_DONTPREP)) { + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; + } +} + struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { @@ -392,6 +401,7 @@ struct request *nvme_alloc_request(struct request_queue *q, return req; req->cmd_flags |= REQ_FAILFAST_DRIVER; + nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; return req; @@ -608,11 +618,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, { blk_status_t ret = BLK_STS_OK; - if (!(req->rq_flags & RQF_DONTPREP)) { - nvme_req(req)->retries = 0; - nvme_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; - } + nvme_clear_nvme_request(req); switch (req_op(req)) { case REQ_OP_DRV_IN: @@ -742,6 +748,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, return PTR_ERR(req); req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 3583f9492a45..124c458806df 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -536,6 +536,85 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } +blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live, bool is_connected) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + + if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) + return BLK_STS_OK; + + switch (ctrl->state) { + case NVME_CTRL_DELETING: + goto reject_io; + + case NVME_CTRL_NEW: + case NVME_CTRL_CONNECTING: + if (!is_connected) + /* + * This is the case of starting a new + * association but connectivity was lost + * before it was fully created. We need to + * error the commands used to initialize the + * controller so the reconnect can go into a + * retry attempt. The commands should all be + * marked REQ_FAILFAST_DRIVER, which will hit + * the reject path below. Anything else will + * be queued while the state settles. + */ + goto reject_or_queue_io; + + if ((queue_live && + !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) || + (!queue_live && blk_rq_is_passthrough(rq) && + cmd->common.opcode == nvme_fabrics_command && + cmd->fabrics.fctype == nvme_fabrics_type_connect)) + /* + * If queue is live, allow only commands that + * are internally generated pass through. These + * are commands on the admin queue to initialize + * the controller. This will reject any ioctl + * admin cmds received while initializing. + * + * If the queue is not live, allow only a + * connect command. This will reject any ioctl + * admin cmd as well as initialization commands + * if the controller reverted the queue to non-live. 
+ */ + return BLK_STS_OK; + + /* + * fall-thru to the reject_or_queue_io clause + */ + break; + + /* these cases fall-thru + * case NVME_CTRL_LIVE: + * case NVME_CTRL_RESETTING: + */ + default: + break; + } + +reject_or_queue_io: + /* + * Any other new io is something we're not in a state to send + * to the device. Default action is to busy it and retry it + * after the controller state is recovered. However, anything + * marked for failfast or nvme multipath is immediately failed. + * Note: commands used to initialize the controller will be + * marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ + if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + +reject_io: + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(nvmf_check_if_ready); + static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, { NVMF_OPT_TRADDR, "traddr=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a3145d90c1d2..ef46c915b7b5 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -157,36 +157,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); - -static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl, - struct request *rq) -{ - struct nvme_command *cmd = nvme_req(rq)->cmd; - - /* - * We cannot accept any other command until the connect command has - * completed, so only allow connect to pass. 
- */ - if (!blk_rq_is_passthrough(rq) || - cmd->common.opcode != nvme_fabrics_command || - cmd->fabrics.fctype != nvme_fabrics_type_connect) { - /* - * Connecting state means transport disruption or initial - * establishment, which can take a long time and even might - * fail permanently, fail fast to give upper layers a chance - * to failover. - * Deleting state means that the ctrl will never accept commands - * again, fail it permanently. - */ - if (ctrl->state == NVME_CTRL_CONNECTING || - ctrl->state == NVME_CTRL_DELETING) { - nvme_req(rq)->status = NVME_SC_ABORT_REQ; - return BLK_STS_IOERR; - } - return BLK_STS_RESOURCE; /* try again later */ - } - - return BLK_STS_OK; -} +blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, + struct request *rq, bool queue_live, bool is_connected); #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c6e719b2f3ca..6cb26bcf6ec0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2277,14 +2277,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, return BLK_STS_OK; } -static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue, - struct request *rq) -{ - if (unlikely(!test_bit(NVME_FC_Q_LIVE, &queue->flags))) - return nvmf_check_init_req(&queue->ctrl->ctrl, rq); - return BLK_STS_OK; -} - static blk_status_t nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) @@ -2300,7 +2292,9 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, u32 data_len; blk_status_t ret; - ret = nvme_fc_is_ready(queue, rq); + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, + test_bit(NVME_FC_Q_LIVE, &queue->flags), + ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 08c4cff79cde..061fecfd44f5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -105,6 +105,7 @@ struct nvme_request { enum { 
NVME_REQ_CANCELLED = (1 << 0), + NVME_REQ_USERCMD = (1 << 1), }; static inline struct nvme_request *nvme_req(struct request *req) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 758537e9ba07..1eb4438a8763 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1601,17 +1601,6 @@ nvme_rdma_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } -/* - * We cannot accept any other command until the Connect command has completed. - */ -static inline blk_status_t -nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq) -{ - if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) - return nvmf_check_init_req(&queue->ctrl->ctrl, rq); - return BLK_STS_OK; -} - static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1627,7 +1616,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, WARN_ON_ONCE(rq->tag < 0); - ret = nvme_rdma_is_ready(queue, rq); + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, + test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b9d5b69d8548..31fdfba556a8 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -149,14 +149,6 @@ nvme_loop_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } -static inline blk_status_t nvme_loop_is_ready(struct nvme_loop_queue *queue, - struct request *rq) -{ - if (unlikely(!test_bit(NVME_LOOP_Q_LIVE, &queue->flags))) - return nvmf_check_init_req(&queue->ctrl->ctrl, rq); - return BLK_STS_OK; -} - static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -166,7 +158,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - ret = nvme_loop_is_ready(queue, req); + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req, + 
test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true); if (unlikely(ret)) return ret;