From 27b13e209ddca5979847a1b57890e0372c1edcee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:22 +0800 Subject: [PATCH 01/33] blk-throttle: fix lockdep warning of "cgroup_mutex or RCU read lock required!" Inside blkg_for_each_descendant_pre(), both css_for_each_descendant_pre() and blkg_lookup() requires RCU read lock, and either cgroup_assert_mutex_or_rcu_locked() or rcu_read_lock_held() is called. Fix the warning by adding rcu read lock. Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 13e4377a8b28..16f5766620a4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1320,6 +1320,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); + rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its @@ -1347,6 +1348,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) this_tg->latency_target = max(this_tg->latency_target, parent_tg->latency_target); } + rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's From 35a99d6557cacbc177314735342f77a2dda41872 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:23 +0800 Subject: [PATCH 02/33] blk-cgroup: avoid to warn !rcu_read_lock_held() in blkg_lookup() So far, all callers either holds spin lock or rcu read explicitly, and most of the caller has added WARN_ON_ONCE(!rcu_read_lock_held()) or lockdep_assert_held(&disk->queue->queue_lock). Remove WARN_ON_ONCE(!rcu_read_lock_held()) from blkg_lookup() for killing the false positive warning from blkg_conf_prep(). Reported-by: Changhui Zhong Fixes: 83462a6c971c ("blkcg: Drop unnecessary RCU read [un]locks from blkg_conf_prep/finish()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 624c03c8fe64..fd482439afbc 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -249,8 +249,6 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; - WARN_ON_ONCE(!rcu_read_lock_held()); - if (blkcg == &blkcg_root) return q->root_blkg; From e63a57303599b17290cd8bc48e6f20b24289a8bc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:24 +0800 Subject: [PATCH 03/33] blk-cgroup: bypass blkcg_deactivate_policy after destroying blkcg_deactivate_policy() can be called after blkg_destroy_all() returns, and it isn't necessary since blkg_destroy_all has covered policy deactivation. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4a42ea2972ad..4b48c2c44098 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -577,6 +577,7 @@ static void blkg_destroy_all(struct gendisk *disk) struct request_queue *q = disk->queue; struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; + int i; restart: spin_lock_irq(&q->queue_lock); @@ -602,6 +603,18 @@ restart: } } + /* + * Mark policy deactivated since policy offline has been done, and + * the free is scheduled, so future blkcg_deactivate_policy() can + * be bypassed + */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (pol) + __clear_bit(pol->plid, q->blkcg_pols); + } + q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } From 45b478951b2ba5aea70b2850c49c1aa83aedd0d2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 17 Nov 2023 15:56:30 -0800 Subject: [PATCH 04/33] md: fix bi_status reporting in md_end_clone_io md_end_clone_io() may overwrite error status in orig_bio->bi_status with BLK_STS_OK. This could happen when orig_bio has BIO_CHAIN (split by md_submit_bio => bio_split_to_limits, for example). As a result, upper layer may miss error reported from md (or the device) and consider the failed IO was successful. Fix this by only update orig_bio->bi_status when current bio reports error and orig_bio is BLK_STS_OK. This is the same behavior as __bio_chain_endio(). Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5") Cc: stable@vger.kernel.org # v5.14+ Reported-by: Bhanu Victor DiCara <00bvd0+linux@gmail.com> Closes: https://lore.kernel.org/regressions/5727380.DvuYhMxLoT@bvd0/ Signed-off-by: Song Liu Tested-by: Xiao Ni Reviewed-by: Yu Kuai Acked-by: Guoqing Jiang --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4ee4593c874a..c94373d64f2c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8666,7 +8666,8 @@ static void md_end_clone_io(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - orig_bio->bi_status = bio->bi_status; + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; if (md_io_clone->start_time) bio_end_io_acct(orig_bio, md_io_clone->start_time); From baf8fb7e0e5ec54ea0839f0c534f2cdcd79bea9c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:24:54 +0800 Subject: [PATCH 05/33] bcache: avoid oversize memory allocation by small stripe_size Arraies bcache->stripe_sectors_dirty and bcache->full_dirty_stripes are used for dirty data writeback, their sizes are decided by backing device capacity and stripe size. Larger backing device capacity or smaller stripe size make these two arraies occupies more dynamic memory space. Currently bcache->stripe_size is directly inherited from queue->limits.io_opt of underlying storage device. For normal hard drives, its limits.io_opt is 0, and bcache sets the corresponding stripe_size to 1TB (1<<31 sectors), it works fine 10+ years. But for devices do declare value for queue->limits.io_opt, small stripe_size (comparing to 1TB) becomes an issue for oversize memory allocations of bcache->stripe_sectors_dirty and bcache->full_dirty_stripes, while the capacity of hard drives gets much larger in recent decade. For example a raid5 array assembled by three 20TB hardrives, the raid device capacity is 40TB with typical 512KB limits.io_opt. After the math calculation in bcache code, these two arraies will occupy 400MB dynamic memory. Even worse Andrea Tomassetti reports that a 4KB limits.io_opt is declared on a new 2TB hard drive, then these two arraies request 2GB and 512MB dynamic memory from kzalloc(). The result is that bcache device always fails to initialize on his system. To avoid the oversize memory allocation, bcache->stripe_size should not directly inherited by queue->limits.io_opt from the underlying device. This patch defines BCH_MIN_STRIPE_SZ (4MB) as minimal bcache stripe size and set bcache device's stripe size against the declared limits.io_opt value from the underlying storage device, - If the declared limits.io_opt > BCH_MIN_STRIPE_SZ, bcache device will set its stripe size directly by this limits.io_opt value. - If the declared limits.io_opt < BCH_MIN_STRIPE_SZ, bcache device will set its stripe size by a value multiplying limits.io_opt and euqal or large than BCH_MIN_STRIPE_SZ. Then the minimal stripe size of a bcache device will always be >= 4MB. For a 40TB raid5 device with 512KB limits.io_opt, memory occupied by bcache->stripe_sectors_dirty and bcache->full_dirty_stripes will be 50MB in total. For a 2TB hard drive with 4KB limits.io_opt, memory occupied by these two arraies will be 2.5MB in total. Such mount of memory allocated for bcache->stripe_sectors_dirty and bcache->full_dirty_stripes is reasonable for most of storage devices. Reported-by: Andrea Tomassetti Signed-off-by: Coly Li Reviewed-by: Eric Wheeler Link: https://lore.kernel.org/r/20231120052503.6122-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 05be59ae21b2..6ae2329052c9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,6 +265,7 @@ struct bcache_device { #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 int nr_stripes; +#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8bd899766372..c7ecc7058d77 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -905,6 +905,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->stripe_size) d->stripe_size = 1 << 31; + else if (d->stripe_size < BCH_MIN_STRIPE_SZ) + d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size); n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!n || n > max_stripes) { From 777967e7e9f6f5f3e153abffb562bffaf4430d26 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:24:55 +0800 Subject: [PATCH 06/33] bcache: check return value from btree_node_alloc_replacement() In btree_gc_rewrite_node(), pointer 'n' is not checked after it returns from btree_gc_rewrite_node(). There is potential possibility that 'n' is a non NULL ERR_PTR(), referencing such error code is not permitted in following code. Therefore a return value checking is necessary after 'n' is back from btree_node_alloc_replacement(). Signed-off-by: Coly Li Reported-by: Dan Carpenter Cc: Link: https://lore.kernel.org/r/20231120052503.6122-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ae5cbb55861f..de8d552201dc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1532,6 +1532,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return 0; n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; /* recheck reserve after allocating replacement node */ if (btree_check_reserve(b, NULL)) { From be93825f0e6428c2d3f03a6e4d447dc48d33d7ff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Nov 2023 13:24:56 +0800 Subject: [PATCH 07/33] bcache: remove redundant assignment to variable cur_idx Variable cur_idx is being initialized with a value that is never read, it is being re-assigned later in a while-loop. Remove the redundant assignment. Cleans up clang scan build warning: drivers/md/bcache/writeback.c:916:2: warning: Value stored to 'cur_idx' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Coly Li Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 24c049067f61..c3e872e0a6f2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -913,7 +913,7 @@ static int bch_dirty_init_thread(void *arg) int cur_idx, prev_idx, skip_nr; k = p = NULL; - cur_idx = prev_idx = 0; + prev_idx = 0; bch_btree_iter_init(&c->root->keys, &iter, NULL); k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); From 2c7f497ac274a14330208b18f6f734000868ebf9 Mon Sep 17 00:00:00 2001 From: Rand Deeb Date: Mon, 20 Nov 2023 13:24:57 +0800 Subject: [PATCH 08/33] bcache: prevent potential division by zero error In SHOW(), the variable 'n' is of type 'size_t.' While there is a conditional check to verify that 'n' is not equal to zero before executing the 'do_div' macro, concerns arise regarding potential division by zero error in 64-bit environments. The concern arises when 'n' is 64 bits in size, greater than zero, and the lower 32 bits of it are zeros. In such cases, the conditional check passes because 'n' is non-zero, but the 'do_div' macro casts 'n' to 'uint32_t,' effectively truncating it to its lower 32 bits. Consequently, the 'n' value becomes zero. To fix this potential division by zero error and ensure precise division handling, this commit replaces the 'do_div' macro with div64_u64(). div64_u64() is designed to work with 64-bit operands, guaranteeing that division is performed correctly. This change enhances the robustness of the code, ensuring that division operations yield accurate results in all scenarios, eliminating the possibility of division by zero, and improving compatibility across different 64-bit environments. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Rand Deeb Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 45d8af755de6..a438efb66069 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1104,7 +1104,7 @@ SHOW(__bch_cache) sum += INITIAL_PRIO - cached[i]; if (n) - do_div(sum, n); + sum = div64_u64(sum, n); for (i = 0; i < ARRAY_SIZE(q); i++) q[i] = INITIAL_PRIO - cached[n * (i + 1) / From 7cc47e64d3d69786a2711a4767e26b26ba63d7ed Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:24:58 +0800 Subject: [PATCH 09/33] bcache: fixup init dirty data errors We found that after long run, the dirty_data of the bcache device will have errors. This error cannot be eliminated unless re-register. We also found that reattach after detach, this error can accumulate. In bch_sectors_dirty_init(), all inode <= d->id keys will be recounted again. This is wrong, we only need to count the keys of the current device. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-6-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c3e872e0a6f2..77fb72ac6b81 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -991,8 +991,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) op.count = 0; for_each_key_filter(&c->root->keys, - k, &iter, bch_ptr_invalid) + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; sectors_dirty_init_fn(&op.op, c->root, k); + } rw_unlock(0, c->root); return; From e34820f984512b433ee1fc291417e60c47d56727 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:24:59 +0800 Subject: [PATCH 10/33] bcache: fixup lock c->root error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We had a problem with io hung because it was waiting for c->root to release the lock. crash> cache_set.root -l cache_set.list ffffa03fde4c0050 root = 0xffff802ef454c800 crash> btree -o 0xffff802ef454c800 | grep rw_semaphore [ffff802ef454c858] struct rw_semaphore lock; crash> struct rw_semaphore ffff802ef454c858 struct rw_semaphore { count = { counter = -4294967297 }, wait_list = { next = 0xffff00006786fc28, prev = 0xffff00005d0efac8 }, wait_lock = { raw_lock = { { val = { counter = 0 }, { locked = 0 '\000', pending = 0 '\000' }, { locked_pending = 0, tail = 0 } } } }, osq = { tail = { counter = 0 } }, owner = 0xffffa03fdc586603 } The "counter = -4294967297" means that lock count is -1 and a write lock is being attempted. Then, we found that there is a btree with a counter of 1 in btree_cache_freeable. crash> cache_set -l cache_set.list ffffa03fde4c0050 -o|grep btree_cache [ffffa03fde4c1140] struct list_head btree_cache; [ffffa03fde4c1150] struct list_head btree_cache_freeable; [ffffa03fde4c1160] struct list_head btree_cache_freed; [ffffa03fde4c1170] unsigned int btree_cache_used; [ffffa03fde4c1178] wait_queue_head_t btree_cache_wait; [ffffa03fde4c1190] struct task_struct *btree_cache_alloc_lock; crash> list -H ffffa03fde4c1140|wc -l 973 crash> list -H ffffa03fde4c1150|wc -l 1123 crash> cache_set.btree_cache_used -l cache_set.list ffffa03fde4c0050 btree_cache_used = 2097 crash> list -s btree -l btree.list -H ffffa03fde4c1140|grep -E -A2 "^ lock = {" > btree_cache.txt crash> list -s btree -l btree.list -H ffffa03fde4c1150|grep -E -A2 "^ lock = {" > btree_cache_freeable.txt [root@node-3 127.0.0.1-2023-08-04-16:40:28]# pwd /var/crash/127.0.0.1-2023-08-04-16:40:28 [root@node-3 127.0.0.1-2023-08-04-16:40:28]# cat btree_cache.txt|grep counter|grep -v "counter = 0" [root@node-3 127.0.0.1-2023-08-04-16:40:28]# cat btree_cache_freeable.txt|grep counter|grep -v "counter = 0" counter = 1 We found that this is a bug in bch_sectors_dirty_init() when locking c->root: (1). Thread X has locked c->root(A) write. (2). Thread Y failed to lock c->root(A), waiting for the lock(c->root A). (3). Thread X bch_btree_set_root() changes c->root from A to B. (4). Thread X releases the lock(c->root A). (5). Thread Y successfully locks c->root(A). (6). Thread Y releases the lock(c->root B). down_write locked ---(1)----------------------┐ | | | down_read waiting ---(2)----┐ | | | ┌-------------┐ ┌-------------┐ bch_btree_set_root ===(3)========>> | c->root A | | c->root B | | | └-------------┘ └-------------┘ up_write ---(4)---------------------┘ | | | | | down_read locked ---(5)-----------┘ | | | up_read ---(6)-----------------------------┘ Since c->root may change, the correct steps to lock c->root should be the same as bch_root_usage(), compare after locking. static unsigned int bch_root_usage(struct cache_set *c) { unsigned int bytes = 0; struct bkey *k; struct btree *b; struct btree_iter iter; goto lock_root; do { rw_unlock(false, b); lock_root: b = c->root; rw_lock(false, b, b->level); } while (b != c->root); for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) bytes += bkey_bytes(k); rw_unlock(false, b); return (bytes * 100) / btree_bytes(c); } Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-7-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 77fb72ac6b81..a1d760916246 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -977,14 +977,22 @@ static int bch_btre_dirty_init_thread_nr(void) void bch_sectors_dirty_init(struct bcache_device *d) { int i; + struct btree *b = NULL; struct bkey *k = NULL; struct btree_iter iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } + /* Just count root keys if no leaf node */ - rw_lock(0, c->root, c->root->level); if (c->root->level == 0) { bch_btree_op_init(&op.op, -1); op.inode = d->id; @@ -997,7 +1005,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) sectors_dirty_init_fn(&op.op, c->root, k); } - rw_unlock(0, c->root); + rw_unlock(0, b); return; } @@ -1033,7 +1041,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) out: /* Must wait for all threads to stop. */ wait_event(state.wait, atomic_read(&state.started) == 0); - rw_unlock(0, c->root); + rw_unlock(0, b); } void bch_cached_dev_writeback_init(struct cached_dev *dc) From 2faac25d7958c4761bb8cec54adb79f806783ad6 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:25:00 +0800 Subject: [PATCH 11/33] bcache: fixup multi-threaded bch_sectors_dirty_init() wake-up race We get a kernel crash about "unable to handle kernel paging request": ```dmesg [368033.032005] BUG: unable to handle kernel paging request at ffffffffad9ae4b5 [368033.032007] PGD fc3a0d067 P4D fc3a0d067 PUD fc3a0e063 PMD 8000000fc38000e1 [368033.032012] Oops: 0003 [#1] SMP PTI [368033.032015] CPU: 23 PID: 55090 Comm: bch_dirtcnt[0] Kdump: loaded Tainted: G OE --------- - - 4.18.0-147.5.1.es8_24.x86_64 #1 [368033.032017] Hardware name: Tsinghua Tongfang THTF Chaoqiang Server/072T6D, BIOS 2.4.3 01/17/2017 [368033.032027] RIP: 0010:native_queued_spin_lock_slowpath+0x183/0x1d0 [368033.032029] Code: 8b 02 48 85 c0 74 f6 48 89 c1 eb d0 c1 e9 12 83 e0 03 83 e9 01 48 c1 e0 05 48 63 c9 48 05 c0 3d 02 00 48 03 04 cd 60 68 93 ad <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 02 [368033.032031] RSP: 0018:ffffbb48852abe00 EFLAGS: 00010082 [368033.032032] RAX: ffffffffad9ae4b5 RBX: 0000000000000246 RCX: 0000000000003bf3 [368033.032033] RDX: ffff97b0ff8e3dc0 RSI: 0000000000600000 RDI: ffffbb4884743c68 [368033.032034] RBP: 0000000000000001 R08: 0000000000000000 R09: 000007ffffffffff [368033.032035] R10: ffffbb486bb01000 R11: 0000000000000001 R12: ffffffffc068da70 [368033.032036] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 [368033.032038] FS: 0000000000000000(0000) GS:ffff97b0ff8c0000(0000) knlGS:0000000000000000 [368033.032039] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [368033.032040] CR2: ffffffffad9ae4b5 CR3: 0000000fc3a0a002 CR4: 00000000003626e0 [368033.032042] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [368033.032043] bcache: bch_cached_dev_attach() Caching rbd479 as bcache462 on set 8cff3c36-4a76-4242-afaa-7630206bc70b [368033.032045] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [368033.032046] Call Trace: [368033.032054] _raw_spin_lock_irqsave+0x32/0x40 [368033.032061] __wake_up_common_lock+0x63/0xc0 [368033.032073] ? bch_ptr_invalid+0x10/0x10 [bcache] [368033.033502] bch_dirty_init_thread+0x14c/0x160 [bcache] [368033.033511] ? read_dirty_submit+0x60/0x60 [bcache] [368033.033516] kthread+0x112/0x130 [368033.033520] ? kthread_flush_work_fn+0x10/0x10 [368033.034505] ret_from_fork+0x35/0x40 ``` The crash occurred when call wake_up(&state->wait), and then we want to look at the value in the state. However, bch_sectors_dirty_init() is not found in the stack of any task. Since state is allocated on the stack, we guess that bch_sectors_dirty_init() has exited, causing bch_dirty_init_thread() to be unable to handle kernel paging request. In order to verify this idea, we added some printing information during wake_up(&state->wait). We find that "wake up" is printed twice, however we only expect the last thread to wake up once. ```dmesg [ 994.641004] alcache: bch_dirty_init_thread() wake up [ 994.641018] alcache: bch_dirty_init_thread() wake up [ 994.641523] alcache: bch_sectors_dirty_init() init exit ``` There is a race. If bch_sectors_dirty_init() exits after the first wake up, the second wake up will trigger this bug("unable to handle kernel paging request"). Proceed as follows: bch_sectors_dirty_init kthread_run ==============> bch_dirty_init_thread(bch_dirtcnt[0]) ... ... atomic_inc(&state.started) ... ... ... atomic_read(&state.enough) ... ... atomic_set(&state->enough, 1) kthread_run ======================================================> bch_dirty_init_thread(bch_dirtcnt[1]) ... atomic_dec_and_test(&state->started) ... atomic_inc(&state.started) ... ... ... wake_up(&state->wait) ... atomic_read(&state.enough) atomic_dec_and_test(&state->started) ... ... wait_event(state.wait, atomic_read(&state.started) == 0) ... return ... wake_up(&state->wait) We believe it is very common to wake up twice if there is no dirty, but crash is an extremely low probability event. It's hard for us to reproduce this issue. We attached and detached continuously for a week, with a total of more than one million attaches and only one crash. Putting atomic_inc(&state.started) before kthread_run() can avoid waking up twice. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-8-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index a1d760916246..3accfdaee6b1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1025,17 +1025,18 @@ retry_lock: if (atomic_read(&state.enough)) break; + atomic_inc(&state.started); state.infos[i].state = &state; state.infos[i].thread = kthread_run(bch_dirty_init_thread, &state.infos[i], "bch_dirtcnt[%d]", i); if (IS_ERR(state.infos[i].thread)) { pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); for (--i; i >= 0; i--) kthread_stop(state.infos[i].thread); goto out; } - atomic_inc(&state.started); } out: From f72f4312d4388376fc8a1f6cf37cb21a0d41758b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:01 +0800 Subject: [PATCH 12/33] bcache: replace a mistaken IS_ERR() by IS_ERR_OR_NULL() in btree_gc_coalesce() Commit 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") do the following change inside btree_gc_coalesce(), 31 @@ -1340,7 +1340,7 @@ static int btree_gc_coalesce( 32 memset(new_nodes, 0, sizeof(new_nodes)); 33 closure_init_stack(&cl); 34 35 - while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 36 + while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) 37 keys += r[nodes++].keys; 38 39 blocks = btree_default_blocks(b->c) * 2 / 3; At line 35 the original r[nodes].b is not always allocatored from __bch_btree_node_alloc(), and possibly initialized as NULL pointer by caller of btree_gc_coalesce(). Therefore the change at line 36 is not correct. This patch replaces the mistaken IS_ERR() by IS_ERR_OR_NULL() to avoid potential issue. Fixes: 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") Cc: # 6.5+ Cc: Zheng Wang Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-9-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index de8d552201dc..79f1fa4a0d55 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1368,7 +1368,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; From 31f5b956a197d4ec25c8a07cb3a2ab69d0c0b82f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:02 +0800 Subject: [PATCH 13/33] bcache: add code comments for bch_btree_node_get() and __bch_btree_node_alloc() This patch adds code comments to bch_btree_node_get() and __bch_btree_node_alloc() that NULL pointer will not be returned and it is unnecessary to check NULL pointer by the callers of these routines. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-10-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 79f1fa4a0d55..de3019972b35 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1000,6 +1000,9 @@ err: * * The btree node will have either a read or a write lock held, depending on * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, @@ -1111,6 +1114,10 @@ retry: mutex_unlock(&b->c->bucket_lock); } +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent) From 3eba5e0b2422aec3c9e79822029599961fdcab97 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:03 +0800 Subject: [PATCH 14/33] bcache: avoid NULL checking to c->root in run_cache_set() In run_cache_set() after c->root returned from bch_btree_node_get(), it is checked by IS_ERR_OR_NULL(). Indeed it is unncessary to check NULL because bch_btree_node_get() will not return NULL pointer to caller. This patch replaces IS_ERR_OR_NULL() by IS_ERR() for the above reason. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-11-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c7ecc7058d77..bfe1685dbae5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2018,7 +2018,7 @@ static int run_cache_set(struct cache_set *c) c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; list_del_init(&c->root->list); From 1b59860540a4018e8071dc18d4893ec389506b7d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:14 +0800 Subject: [PATCH 15/33] nbd: fold nbd config initialization into nbd_alloc_config() There are no functional changes, make the code cleaner and prepare to fix null-ptr-dereference while accessing 'nbd->config'. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-2-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 855fdf5c3b4e..02f844832d91 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1530,17 +1530,20 @@ static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode, return error; } -static struct nbd_config *nbd_alloc_config(void) +static int nbd_alloc_and_init_config(struct nbd_device *nbd) { struct nbd_config *config; + if (WARN_ON(nbd->config)) + return -EINVAL; + if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); + return -ENODEV; config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); if (!config) { module_put(THIS_MODULE); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } atomic_set(&config->recv_threads, 0); @@ -1548,7 +1551,10 @@ static struct nbd_config *nbd_alloc_config(void) init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); - return config; + nbd->config = config; + refcount_set(&nbd->config_refs, 1); + + return 0; } static int nbd_open(struct gendisk *disk, blk_mode_t mode) @@ -1567,21 +1573,17 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) goto out; } if (!refcount_inc_not_zero(&nbd->config_refs)) { - struct nbd_config *config; - mutex_lock(&nbd->config_lock); if (refcount_inc_not_zero(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); goto out; } - config = nbd_alloc_config(); - if (IS_ERR(config)) { - ret = PTR_ERR(config); + ret = nbd_alloc_and_init_config(nbd); + if (ret) { mutex_unlock(&nbd->config_lock); goto out; } - nbd->config = config; - refcount_set(&nbd->config_refs, 1); + refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); if (max_part) @@ -1990,22 +1992,17 @@ again: pr_err("nbd%d already in use\n", index); return -EBUSY; } - if (WARN_ON(nbd->config)) { - mutex_unlock(&nbd->config_lock); - nbd_put(nbd); - return -EINVAL; - } - config = nbd_alloc_config(); - if (IS_ERR(config)) { + + ret = nbd_alloc_and_init_config(nbd); + if (ret) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); pr_err("couldn't allocate config\n"); - return PTR_ERR(config); + return ret; } - nbd->config = config; - refcount_set(&nbd->config_refs, 1); - set_bit(NBD_RT_BOUND, &config->runtime_flags); + config = nbd->config; + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) goto out; From 3123ac77923341774ca3ad1196ad20bb0732bf70 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:15 +0800 Subject: [PATCH 16/33] nbd: factor out a helper to get nbd_config without holding 'config_lock' There are no functional changes, just to make code cleaner and prepare to fix null-ptr-dereference while accessing 'nbd->config'. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-3-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 02f844832d91..daaf8805e876 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -395,6 +395,14 @@ static u32 req_to_nbd_cmd_type(struct request *req) } } +static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd) +{ + if (refcount_inc_not_zero(&nbd->config_refs)) + return nbd->config; + + return NULL; +} + static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); @@ -409,13 +417,13 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) return BLK_EH_DONE; } - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { cmd->status = BLK_STS_TIMEOUT; __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } - config = nbd->config; if (config->num_connections > 1 || (config->num_connections == 1 && nbd->tag_set.timeout)) { @@ -977,12 +985,12 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) struct nbd_sock *nsock; int ret; - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); return -EINVAL; } - config = nbd->config; if (index >= config->num_connections) { dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -1560,6 +1568,7 @@ static int nbd_alloc_and_init_config(struct nbd_device *nbd) static int nbd_open(struct gendisk *disk, blk_mode_t mode) { struct nbd_device *nbd; + struct nbd_config *config; int ret = 0; mutex_lock(&nbd_index_mutex); @@ -1572,7 +1581,9 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) ret = -ENXIO; goto out; } - if (!refcount_inc_not_zero(&nbd->config_refs)) { + + config = nbd_get_config_unlocked(nbd); + if (!config) { mutex_lock(&nbd->config_lock); if (refcount_inc_not_zero(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); @@ -1588,7 +1599,7 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) mutex_unlock(&nbd->config_lock); if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); - } else if (nbd_disconnected(nbd->config)) { + } else if (nbd_disconnected(config)) { if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); } @@ -2205,7 +2216,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } mutex_unlock(&nbd_index_mutex); - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); nbd_put(nbd); @@ -2213,7 +2225,6 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } mutex_lock(&nbd->config_lock); - config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->pid) { dev_err(nbd_to_dev(nbd), From c2da049f419417808466c529999170f5c3ef7d3d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:16 +0800 Subject: [PATCH 17/33] nbd: fix null-ptr-dereference while accessing 'nbd->config' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory reordering may occur in nbd_genl_connect(), causing config_refs to be set to 1 while nbd->config is still empty. Opening nbd at this time will cause null-ptr-dereference. T1 T2 nbd_open nbd_get_config_unlocked nbd_genl_connect nbd_alloc_and_init_config //memory reordered refcount_set(&nbd->config_refs, 1) // 2 nbd->config ->null point nbd->config = config // 1 Fix it by adding smp barrier to guarantee the execution sequence. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-4-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index daaf8805e876..3f03cb3dc33c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -397,8 +397,16 @@ static u32 req_to_nbd_cmd_type(struct request *req) static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd) { - if (refcount_inc_not_zero(&nbd->config_refs)) + if (refcount_inc_not_zero(&nbd->config_refs)) { + /* + * Add smp_mb__after_atomic to ensure that reading nbd->config_refs + * and reading nbd->config is ordered. The pair is the barrier in + * nbd_alloc_and_init_config(), avoid nbd->config_refs is set + * before nbd->config. + */ + smp_mb__after_atomic(); return nbd->config; + } return NULL; } @@ -1559,7 +1567,15 @@ static int nbd_alloc_and_init_config(struct nbd_device *nbd) init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); + nbd->config = config; + /* + * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment, + * its pair is the barrier in nbd_get_config_unlocked(). + * So nbd_get_config_unlocked() won't see nbd->config as null after + * refcount_inc_not_zero() succeed. + */ + smp_mb__before_atomic(); refcount_set(&nbd->config_refs, 1); return 0; From c96b8175522a2c52e297ee0a49827a668f95e1e8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 20 Nov 2023 16:06:11 +0900 Subject: [PATCH 18/33] block: Remove blk_set_runtime_active() The function blk_set_runtime_active() is called only from blk_post_runtime_resume(), so there is no need for that function to be exported. Open-code this function directly in blk_post_runtime_resume() and remove it. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20231120070611.33951-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-pm.c | 33 +++++---------------------------- include/linux/blk-pm.h | 1 - 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/block/blk-pm.c b/block/blk-pm.c index 6b72b2e03fc8..42e842074715 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -163,38 +163,15 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); * @q: the queue of the device * * Description: - * For historical reasons, this routine merely calls blk_set_runtime_active() - * to do the real work of restarting the queue. It does this regardless of - * whether the device's runtime-resume succeeded; even if it failed the + * Restart the queue of a runtime suspended device. It does this regardless + * of whether the device's runtime-resume succeeded; even if it failed the * driver or error handler will need to communicate with the device. * * This function should be called near the end of the device's - * runtime_resume callback. + * runtime_resume callback to correct queue runtime PM status and re-enable + * peeking requests from the queue. */ void blk_post_runtime_resume(struct request_queue *q) -{ - blk_set_runtime_active(q); -} -EXPORT_SYMBOL(blk_post_runtime_resume); - -/** - * blk_set_runtime_active - Force runtime status of the queue to be active - * @q: the queue of the device - * - * If the device is left runtime suspended during system suspend the resume - * hook typically resumes the device and corrects runtime status - * accordingly. However, that does not affect the queue runtime PM status - * which is still "suspended". This prevents processing requests from the - * queue. - * - * This function can be used in driver's resume hook to correct queue - * runtime PM status and re-enable peeking requests from the queue. It - * should be called before first request is added to the queue. - * - * This function is also called by blk_post_runtime_resume() for - * runtime resumes. It does everything necessary to restart the queue. - */ -void blk_set_runtime_active(struct request_queue *q) { int old_status; @@ -211,4 +188,4 @@ void blk_set_runtime_active(struct request_queue *q) if (old_status != RPM_ACTIVE) blk_clear_pm_only(q); } -EXPORT_SYMBOL(blk_set_runtime_active); +EXPORT_SYMBOL(blk_post_runtime_resume); diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h index 2580e05a8ab6..004b38a538ff 100644 --- a/include/linux/blk-pm.h +++ b/include/linux/blk-pm.h @@ -15,7 +15,6 @@ extern int blk_pre_runtime_suspend(struct request_queue *q); extern void blk_post_runtime_suspend(struct request_queue *q, int err); extern void blk_pre_runtime_resume(struct request_queue *q); extern void blk_post_runtime_resume(struct request_queue *q); -extern void blk_set_runtime_active(struct request_queue *q); #else static inline void blk_pm_runtime_init(struct request_queue *q, struct device *dev) {} From 616add70bfdc0274a253e84fc78155c27aacde91 Mon Sep 17 00:00:00 2001 From: Mark O'Donovan Date: Wed, 11 Oct 2023 08:45:11 +0000 Subject: [PATCH 19/33] nvme-auth: unlock mutex in one place only Signed-off-by: Mark O'Donovan Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 48328e36e93b..0f5ea63d3c8d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -757,12 +757,11 @@ static void nvme_queue_auth_work(struct work_struct *work) __func__, chap->qid); mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); + mutex_unlock(&ctrl->dhchap_auth_mutex); if (ret) { - mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } - mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", From 38ce1570e2c46e7e9af983aa337edd7e43723aa2 Mon Sep 17 00:00:00 2001 From: Mark O'Donovan Date: Wed, 11 Oct 2023 08:45:12 +0000 Subject: [PATCH 20/33] nvme-auth: set explanation code for failure2 msgs Some error cases were not setting an auth-failure-reason-code-explanation. This means an AUTH_Failure2 message will be sent with an explanation value of 0 which is a reserved value. Signed-off-by: Mark O'Donovan Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0f5ea63d3c8d..72c0525c75f5 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -838,6 +838,8 @@ static void nvme_queue_auth_work(struct work_struct *work) } fail2: + if (chap->status == 0) + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n", __func__, chap->qid, chap->status); tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap); From 23441536b63677cb2ed9b1637d8ca70315e44bd0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 14 Nov 2023 14:18:21 +0100 Subject: [PATCH 21/33] nvme-tcp: only evaluate 'tls' option if TLS is selected We only need to evaluate the 'tls' connect option if TLS is enabled; otherwise we might be getting a link error. Fixes: 706add13676d ("nvme: keyring: fix conditional compilation") Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202311140426.0eHrTXBr-lkp@intel.com/ Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 89661a9cf850..6ed794815517 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1916,7 +1916,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (ctrl->opts->tls) { + if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && ctrl->opts->tls) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); else From cd9aed606088d36a7ffff3e808db4e76b1854285 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 14 Nov 2023 14:27:01 +0100 Subject: [PATCH 22/33] nvme: catch errors from nvme_configure_metadata() nvme_configure_metadata() is issuing I/O, so we might incur an I/O error which will cause the connection to be reset. But in that case any further probing will race with reset and cause UAF errors. So return a status from nvme_configure_metadata() and abort probing if there was an I/O error. Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88b54cdcbd68..fd28e6b6574c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1814,16 +1814,18 @@ set_pi: return ret; } -static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; + int ret; - if (nvme_init_ms(ns, id)) - return; + ret = nvme_init_ms(ns, id); + if (ret) + return ret; ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return; + return 0; if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -1832,7 +1834,7 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) * remap the separate metadata buffer from the block layer. */ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return; + return 0; ns->features |= NVME_NS_EXT_LBAS; @@ -1859,6 +1861,7 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) else ns->features |= NVME_NS_METADATA_SUPPORTED; } + return 0; } static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, @@ -2032,7 +2035,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); - nvme_configure_metadata(ns, id); + ret = nvme_configure_metadata(ns, id); + if (ret < 0) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(ns->disk, ns, id); From c7ca9757bda35ff9ce27ab42f2cb8b84d983e6ad Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 16 Nov 2023 13:14:35 +0100 Subject: [PATCH 23/33] nvme: blank out authentication fabrics options if not configured If the config option NVME_HOST_AUTH is not selected we should not accept the corresponding fabrics options. This allows userspace to detect if NVMe authentication has been enabled for the kernel. Cc: Shin'ichiro Kawasaki Fixes: f50fff73d620 ("nvme: implement In-Band authentication") Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Daniel Wagner Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4673ead69c5f..aa88606a44c4 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -667,8 +667,10 @@ static const match_table_t opt_tokens = { #endif { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_DISCOVERY, "discovery" }, +#ifdef CONFIG_NVME_HOST_AUTH { NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" }, { NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" }, +#endif #ifdef CONFIG_NVME_TCP_TLS { NVMF_OPT_TLS, "tls" }, #endif From 1c22e0295a5eb571c27b53c7371f95699ef705ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Nov 2023 08:13:36 -0500 Subject: [PATCH 24/33] nvmet: nul-terminate the NQNs passed in the connect command The host and subsystem NQNs are passed in the connect command payload and interpreted as nul-terminated strings. Ensure they actually are nul-terminated before using them. Fixes: a07b4970f464 "nvmet: add a generic NVMe target") Reported-by: Alon Zahavi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 43b5bd8bb6a5..d8da840a1c0e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -244,6 +244,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } + d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; + d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); if (status) @@ -313,6 +315,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out; } + d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; + d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, le16_to_cpu(d->cntlid), req); if (!ctrl) { From 11b9d0b4999705105d1fb7f0e8ac969e0f41b1b8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 20 Oct 2023 07:06:06 +0200 Subject: [PATCH 25/33] nvmet-tcp: always initialize tls_handshake_tmo_work The TLS handshake timeout work item should always be initialized to avoid a crash when cancelling the workqueue. Fixes: 675b453e0241 ("nvmet-tcp: enable TLS handshake upcall") Suggested-by: Maurizio Lombardi Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Tested-by: Yi Zhang Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 92b74d0b8686..4cc27856aa8f 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1854,6 +1854,8 @@ static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue) } return ret; } +#else +static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) {} #endif static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, @@ -1911,9 +1913,9 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); -#ifdef CONFIG_NVME_TARGET_TCP_TLS INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work, nvmet_tcp_tls_handshake_timeout); +#ifdef CONFIG_NVME_TARGET_TCP_TLS if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { struct sock *sk = queue->sock->sk; From 53f2bca2609237f910531f2c1a7779b16ce7952d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 20 Nov 2023 03:25:21 +0000 Subject: [PATCH 26/33] block/null_blk: Fix double blk_mq_start_request() warning When CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION is enabled, null_queue_rq() would return BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE for the request, which has been marked as MQ_RQ_IN_FLIGHT by blk_mq_start_request(). Then null_queue_rqs() put these requests in the rqlist, return back to the block layer core, which would try to queue them individually again, so the warning in blk_mq_start_request() triggered. Fix it by splitting the null_queue_rq() into two parts: the first is the preparation of request, the second is the handling of request. We put the blk_mq_start_request() after the preparation part, which may fail and return back to the block layer core. The throttling also belongs to the preparation part, so move it before blk_mq_start_request(). And change the return type of null_handle_cmd() to void, since it always return BLK_STS_OK now. Reported-by: Closes: https://lore.kernel.org/all/0000000000000e6aac06098aee0c@google.com/ Fixes: d78bfa1346ab ("block/null_blk: add queue_rqs() support") Suggested-by: Bart Van Assche Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20231120032521.1012037-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 22a3cf7f32e2..3021d58ca51c 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1464,19 +1464,13 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, return BLK_STS_OK; } -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_op op) +static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts; - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - sts = null_handle_throttled(cmd); - if (sts != BLK_STS_OK) - return sts; - } - if (op == REQ_OP_FLUSH) { cmd->error = errno_to_blk_status(null_handle_flush(nullb)); goto out; @@ -1493,7 +1487,6 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, out: nullb_complete_cmd(cmd); - return BLK_STS_OK; } static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) @@ -1724,8 +1717,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->fake_timeout = should_timeout_request(rq) || blk_should_fake_timeout(rq->q); - blk_mq_start_request(rq); - if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the @@ -1738,6 +1729,15 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } + if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) { + blk_status_t sts = null_handle_throttled(cmd); + + if (sts != BLK_STS_OK) + return sts; + } + + blk_mq_start_request(rq); + if (is_poll) { spin_lock(&nq->poll_lock); list_add_tail(&rq->queuelist, &nq->poll_list); @@ -1747,7 +1747,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, if (cmd->fake_timeout) return BLK_STS_OK; - return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); + null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); + return BLK_STS_OK; } static void null_queue_rqs(struct request **rqlist) From 5029c5e4f20d8d6b41cefbde4b3eeadaec4662c6 Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Wed, 25 Oct 2023 15:24:36 +0200 Subject: [PATCH 27/33] s390/dasd: resolve spelling mistake resolve typing mistake from pimary to primary Signed-off-by: Muhammad Muzammil Link: https://lore.kernel.org/r/20231010043140.28416-1-m.muzzammilashraf@gmail.com Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20231025132437.1223363-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 2e663131adaf..1b1b8a41c4d4 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -283,7 +283,7 @@ struct dasd_pprc_dev_info { __u8 secondary; /* 7 Secondary device address */ __u16 pprc_id; /* 8-9 Peer-to-Peer Remote Copy ID */ __u8 reserved2[12]; /* 10-21 reserved */ - __u16 prim_cu_ssid; /* 22-23 Pimary Control Unit SSID */ + __u16 prim_cu_ssid; /* 22-23 Primary Control Unit SSID */ __u8 reserved3[12]; /* 24-35 reserved */ __u16 sec_cu_ssid; /* 36-37 Secondary Control Unit SSID */ __u8 reserved4[90]; /* 38-127 reserved */ From db46cd1e0426f52999d50fa72cfa97fa39952885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Wed, 25 Oct 2023 15:24:37 +0200 Subject: [PATCH 28/33] s390/dasd: protect device queue against concurrent access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In dasd_profile_start() the amount of requests on the device queue are counted. The access to the device queue is unprotected against concurrent access. With a lot of parallel I/O, especially with alias devices enabled, the device queue can change while dasd_profile_start() is accessing the queue. In the worst case this leads to a kernel panic due to incorrect pointer accesses. Fix this by taking the device lock before accessing the queue and counting the requests. Additionally the check for a valid profile data pointer can be done earlier to avoid unnecessary locking in a hot path. Cc: Fixes: 4fa52aa7a82f ("[S390] dasd: add enhanced DASD statistics interface") Reviewed-by: Stefan Haberland Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20231025132437.1223363-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index d440319a7945..833cfab7d877 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -676,18 +676,20 @@ static void dasd_profile_start(struct dasd_block *block, * we count each request only once. */ device = cqr->startdev; - if (device->profile.data) { - counter = 1; /* request is not yet queued on the start device */ - list_for_each(l, &device->ccw_queue) - if (++counter >= 31) - break; - } + if (!device->profile.data) + return; + + spin_lock(get_ccwdev_lock(device->cdev)); + counter = 1; /* request is not yet queued on the start device */ + list_for_each(l, &device->ccw_queue) + if (++counter >= 31) + break; + spin_unlock(get_ccwdev_lock(device->cdev)); + spin_lock(&device->profile.lock); - if (device->profile.data) { - device->profile.data->dasd_io_nr_req[counter]++; - if (rq_data_dir(req) == READ) - device->profile.data->dasd_read_nr_req[counter]++; - } + device->profile.data->dasd_io_nr_req[counter]++; + if (rq_data_dir(req) == READ) + device->profile.data->dasd_read_nr_req[counter]++; spin_unlock(&device->profile.lock); } From 98c598afc22d4e43c2ad91860b65996d0c099a5d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 11 Sep 2023 10:33:08 +0800 Subject: [PATCH 29/33] nbd: pass nbd_sock to nbd_read_reply() instead of index If a socket is processing ioctl 'NBD_SET_SOCK', config->socks might be krealloc in nbd_add_socket(), and a garbage request is received now, a UAF may occurs. T1 nbd_ioctl __nbd_ioctl nbd_add_socket blk_mq_freeze_queue T2 recv_work nbd_read_reply sock_xmit krealloc config->socks def config->socks Pass nbd_sock to nbd_read_reply(). And introduce a new function sock_xmit_recv(), which differs from sock_xmit only in the way it get socket. ================================================================== BUG: KASAN: use-after-free in sock_xmit+0x525/0x550 Read of size 8 at addr ffff8880188ec428 by task kworker/u12:1/18779 Workqueue: knbd4-recv recv_work Call Trace: __dump_stack dump_stack+0xbe/0xfd print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 sock_xmit+0x525/0x550 nbd_read_reply+0xfe/0x2c0 recv_work+0x1c2/0x750 process_one_work+0x6b6/0xf10 worker_thread+0xdd/0xd80 kthread+0x30a/0x410 ret_from_fork+0x22/0x30 Allocated by task 18784: kasan_save_stack+0x1b/0x40 kasan_set_track set_alloc_info __kasan_kmalloc __kasan_kmalloc.constprop.0+0xf0/0x130 slab_post_alloc_hook slab_alloc_node slab_alloc __kmalloc_track_caller+0x157/0x550 __do_krealloc krealloc+0x37/0xb0 nbd_add_socket +0x2d3/0x880 __nbd_ioctl nbd_ioctl+0x584/0x8e0 __blkdev_driver_ioctl blkdev_ioctl+0x2a0/0x6e0 block_ioctl+0xee/0x130 vfs_ioctl __do_sys_ioctl __se_sys_ioctl+0x138/0x190 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Freed by task 18784: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x40 __kasan_slab_free.part.0+0x13f/0x1b0 slab_free_hook slab_free_freelist_hook slab_free kfree+0xcb/0x6c0 krealloc+0x56/0xb0 nbd_add_socket+0x2d3/0x880 __nbd_ioctl nbd_ioctl+0x584/0x8e0 __blkdev_driver_ioctl blkdev_ioctl+0x2a0/0x6e0 block_ioctl+0xee/0x130 vfs_ioctl __do_sys_ioctl __se_sys_ioctl+0x138/0x190 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Signed-off-by: Li Nan Reviewed-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230911023308.3467802-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3f03cb3dc33c..b6414e1e645b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -67,6 +67,7 @@ struct nbd_sock { struct recv_thread_args { struct work_struct work; struct nbd_device *nbd; + struct nbd_sock *nsock; int index; }; @@ -505,15 +506,9 @@ done: return BLK_EH_DONE; } -/* - * Send or receive packet. Return a positive value on success and - * negtive value on failue, and never return 0. - */ -static int sock_xmit(struct nbd_device *nbd, int index, int send, - struct iov_iter *iter, int msg_flags, int *sent) +static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, + struct iov_iter *iter, int msg_flags, int *sent) { - struct nbd_config *config = nbd->config; - struct socket *sock = config->socks[index]->sock; int result; struct msghdr msg; unsigned int noreclaim_flag; @@ -556,6 +551,19 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, return result; } +/* + * Send or receive packet. Return a positive value on success and + * negtive value on failure, and never return 0. + */ +static int sock_xmit(struct nbd_device *nbd, int index, int send, + struct iov_iter *iter, int msg_flags, int *sent) +{ + struct nbd_config *config = nbd->config; + struct socket *sock = config->socks[index]->sock; + + return __sock_xmit(nbd, sock, send, iter, msg_flags, sent); +} + /* * Different settings for sk->sk_sndtimeo can result in different return values * if there is a signal pending when we enter sendmsg, because reasons? @@ -712,7 +720,7 @@ out: return 0; } -static int nbd_read_reply(struct nbd_device *nbd, int index, +static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, struct nbd_reply *reply) { struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; @@ -721,7 +729,7 @@ static int nbd_read_reply(struct nbd_device *nbd, int index, reply->magic = 0; iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply)); - result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); + result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL); if (result < 0) { if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), @@ -845,14 +853,14 @@ static void recv_work(struct work_struct *work) struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; struct request_queue *q = nbd->disk->queue; - struct nbd_sock *nsock; + struct nbd_sock *nsock = args->nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { struct nbd_reply reply; - if (nbd_read_reply(nbd, args->index, &reply)) + if (nbd_read_reply(nbd, nsock->sock, &reply)) break; /* @@ -887,7 +895,6 @@ static void recv_work(struct work_struct *work) percpu_ref_put(&q->q_usage_counter); } - nsock = config->socks[args->index]; mutex_lock(&nsock->tx_lock); nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); @@ -1231,6 +1238,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) INIT_WORK(&args->work, recv_work); args->index = i; args->nbd = nbd; + args->nsock = nsock; nsock->cookie++; mutex_unlock(&nsock->tx_lock); sockfd_put(old); @@ -1413,6 +1421,7 @@ static int nbd_start_device(struct nbd_device *nbd) refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); args->nbd = nbd; + args->nsock = config->socks[i]; args->index = i; queue_work(nbd->recv_workq, &args->work); } From 3af755a46881c32fecaecfdeaf3a8f0a869deca5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Nov 2023 09:01:03 +0100 Subject: [PATCH 30/33] nvme: move nvme_stop_keep_alive() back to original position Stopping keep-alive not only stops the keep-alive workqueue, but also needs to be synchronized with I/O termination as we must not send a keep-alive command after all I/O had been terminated. So to avoid any regressions move the call to stop_keep_alive() back to its original position and ensure that keep-alive is correctly stopped failing to setup the admin queue. Fixes: 4733b65d82bd ("nvme: start keep-alive after admin queue setup") Suggested-by: Sagi Grimberg Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 19 ++++++++----------- drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd28e6b6574c..46a4c9c5ea96 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -482,7 +482,6 @@ EXPORT_SYMBOL_GPL(nvme_cancel_tagset); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) { - nvme_stop_keep_alive(ctrl); if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); @@ -4355,6 +4354,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_auth_stop(ctrl); + nvme_stop_keep_alive(ctrl); nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 49c3e46eaa1e..9f9a3b35dc64 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2530,12 +2530,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * clean up the admin queue. Same thing as above. */ nvme_quiesce_admin_queue(&ctrl->ctrl); - - /* - * Open-coding nvme_cancel_admin_tagset() as fc - * is not using nvme_cancel_request(). - */ - nvme_stop_keep_alive(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -3138,11 +3132,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) nvme_unquiesce_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); - if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) - ret = -EIO; if (ret) goto out_disconnect_admin_queue; - + if (test_bit(ASSOC_FAILED, &ctrl->flags)) { + ret = -EIO; + goto out_stop_keep_alive; + } /* sanity checks */ /* FC-NVME does not have other data in the capsule */ @@ -3150,7 +3145,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", ctrl->ctrl.icdoff); ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_disconnect_admin_queue; + goto out_stop_keep_alive; } /* FC-NVME supports normal SGL Data Block Descriptors */ @@ -3158,7 +3153,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) dev_err(ctrl->ctrl.device, "Mandatory sgls are not supported!\n"); ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_disconnect_admin_queue; + goto out_stop_keep_alive; } if (opts->queue_size > ctrl->ctrl.maxcmd) { @@ -3205,6 +3200,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) out_term_aen_ops: nvme_fc_term_aen_ops(ctrl); +out_stop_keep_alive: + nvme_stop_keep_alive(&ctrl->ctrl); out_disconnect_admin_queue: dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: create_assoc failed, assoc_id %llx ret %d\n", diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a7fea4cbacd7..6d178d555920 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1080,6 +1080,7 @@ destroy_io: nvme_rdma_free_io_queues(ctrl); } destroy_admin: + nvme_stop_keep_alive(&ctrl->ctrl); nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6ed794815517..ddcd23fb8b75 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2237,6 +2237,7 @@ destroy_io: nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: + nvme_stop_keep_alive(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); return ret; } From d78abcbabe7e98bb4baa4dea87550806944790ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:17 +0100 Subject: [PATCH 31/33] nvme: target: fix nvme_keyring_id() references In configurations without CONFIG_NVME_TARGET_TCP_TLS, the keyring code might not be available, or using it will result in a runtime failure: x86_64-linux-ld: vmlinux.o: in function `nvmet_ports_make': configfs.c:(.text+0x100a211): undefined reference to `nvme_keyring_id' Add a check to ensure we only check the keyring if there is a chance of it being used, which avoids both the runtime and link-time problems. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-2-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9eed6e6765ea..e307a044b1a1 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1893,7 +1893,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, return ERR_PTR(-ENOMEM); } - if (nvme_keyring_id()) { + if (IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS) && nvme_keyring_id()) { port->keyring = key_lookup(nvme_keyring_id()); if (IS_ERR(port->keyring)) { pr_warn("NVMe keyring not available, disabling TLS\n"); From 65e2a74c44ddfa174b700f5da2d1d29b4ba6639b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:18 +0100 Subject: [PATCH 32/33] nvme: target: fix Kconfig select statements When the NVME target code is built-in but its TCP frontend is a loadable module, enabling keyring support causes a link failure: x86_64-linux-ld: vmlinux.o: in function `nvmet_ports_make': configfs.c:(.text+0x100a211): undefined reference to `nvme_keyring_id' The problem is that CONFIG_NVME_TARGET_TCP_TLS is a 'bool' symbol that depends on the tristate CONFIG_NVME_TARGET_TCP, so any 'select' from it inherits the state of the tristate symbol rather than the intended CONFIG_NVME_TARGET one that contains the actual call. The same thing is true for CONFIG_KEYS, which itself is required for NVME_KEYRING. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-3-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/target/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 31633da9427c..e1ebc73f3e5e 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -4,6 +4,8 @@ config NVME_TARGET tristate "NVMe Target support" depends on BLOCK depends on CONFIGFS_FS + select NVME_KEYRING if NVME_TARGET_TCP_TLS + select KEYS if NVME_TARGET_TCP_TLS select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY select SGL_ALLOC help @@ -87,9 +89,7 @@ config NVME_TARGET_TCP config NVME_TARGET_TCP_TLS bool "NVMe over Fabrics TCP target TLS encryption support" depends on NVME_TARGET_TCP - select NVME_KEYRING select NET_HANDSHAKE - select KEYS help Enables TLS encryption for the NVMe TCP target using the netlink handshake API. From 0e6c4fe782e683ff55a27fbb10e9c6b5c241533b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:19 +0100 Subject: [PATCH 33/33] nvme: tcp: fix compile-time checks for TLS mode When CONFIG_NVME_KEYRING is enabled as a loadable module, but the TCP host code is built-in, it fails to link: arm-linux-gnueabi-ld: drivers/nvme/host/tcp.o: in function `nvme_tcp_setup_ctrl': tcp.c:(.text+0x1940): undefined reference to `nvme_tls_psk_default' The problem is that the compile-time conditionals are inconsistent here, using a mix of #ifdef CONFIG_NVME_TCP_TLS, IS_ENABLED(CONFIG_NVME_TCP_TLS) and IS_ENABLED(CONFIG_NVME_KEYRING) checks, with CONFIG_NVME_KEYRING controlling whether the implementation is actually built. Change it to use IS_ENABLED(CONFIG_NVME_KEYRING) checks consistently, which should help readability and make it less error-prone. Combining it with the check for the ctrl->opts->tls flag lets the compiler drop all the TLS code in configurations without this feature, which also helps runtime behavior in addition to avoiding the link failure. To make it possible for the compiler to build the dead code, both the tls_handshake_timeout variable and the TLS specific members of nvme_tcp_queue need to be moved out of the #ifdef block as well, but at least the former of these gets optimized out again. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-4-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/tcp.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index ddcd23fb8b75..d79811cfa0ce 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -36,11 +36,11 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); -#ifdef CONFIG_NVME_TCP_TLS /* * TLS handshake timeout */ static int tls_handshake_timeout = 10; +#ifdef CONFIG_NVME_TCP_TLS module_param(tls_handshake_timeout, int, 0644); MODULE_PARM_DESC(tls_handshake_timeout, "nvme TLS handshake timeout in seconds (default 10)"); @@ -161,10 +161,8 @@ struct nvme_tcp_queue { struct ahash_request *snd_hash; __le32 exp_ddgst; __le32 recv_ddgst; -#ifdef CONFIG_NVME_TCP_TLS struct completion tls_complete; int tls_err; -#endif struct page_frag_cache pf_cache; void (*state_change)(struct sock *); @@ -207,6 +205,14 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return ctrl->opts->tls; +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -1412,7 +1418,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); - if (queue->ctrl->ctrl.opts->tls) { + if (nvme_tcp_tls(&queue->ctrl->ctrl)) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } @@ -1424,7 +1430,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } ret = -ENOTCONN; - if (queue->ctrl->ctrl.opts->tls) { + if (nvme_tcp_tls(&queue->ctrl->ctrl)) { ctype = tls_get_record_type(queue->sock->sk, (struct cmsghdr *)cbuf); if (ctype != TLS_RECORD_TYPE_DATA) { @@ -1548,7 +1554,6 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } -#ifdef CONFIG_NVME_TCP_TLS static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) { struct nvme_tcp_queue *queue = data; @@ -1625,14 +1630,6 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, } return ret; } -#else -static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, - struct nvme_tcp_queue *queue, - key_serial_t pskid) -{ - return -EPROTONOSUPPORT; -} -#endif static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, key_serial_t pskid) @@ -1759,7 +1756,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, } /* If PSKs are configured try to start TLS */ - if (pskid) { + if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { ret = nvme_tcp_start_tls(nctrl, queue, pskid); if (ret) goto err_init_connect; @@ -1916,7 +1913,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && ctrl->opts->tls) { + if (nvme_tcp_tls(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); else @@ -1949,7 +1946,7 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (ctrl->opts->tls && !ctrl->tls_key) { + if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { dev_err(ctrl->device, "no PSK negotiated\n"); return -ENOKEY; }