From 151f66bb618d1fd0eeb84acb61b4a9fa5d8bb0fa Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 22 Mar 2024 16:10:05 +0800 Subject: [PATCH 1/4] md/raid5: fix deadlock that raid5d() wait for itself to clear MD_SB_CHANGE_PENDING Xiao reported that lvm2 test lvconvert-raid-takeover.sh can hang with small possibility, the root cause is exactly the same as commit bed9e27baf52 ("Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d"") However, Dan reported another hang after that, and junxiao investigated the problem and found out that this is caused by plugged bio can't issue from raid5d(). Current implementation in raid5d() has a weird dependence: 1) md_check_recovery() from raid5d() must hold 'reconfig_mutex' to clear MD_SB_CHANGE_PENDING; 2) raid5d() handles IO in a deadloop, until all IO are issued; 3) IO from raid5d() must wait for MD_SB_CHANGE_PENDING to be cleared; This behaviour is introduce before v2.6, and for consequence, if other context hold 'reconfig_mutex', and md_check_recovery() can't update super_block, then raid5d() will waste one cpu 100% by the deadloop, until 'reconfig_mutex' is released. Refer to the implementation from raid1 and raid10, fix this problem by skipping issue IO if MD_SB_CHANGE_PENDING is still set after md_check_recovery(), daemon thread will be woken up when 'reconfig_mutex' is released. Meanwhile, the hang problem will be fixed as well. Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d") Cc: stable@vger.kernel.org # v5.19+ Reported-and-tested-by: Dan Moulding Closes: https://lore.kernel.org/all/20240123005700.9302-1-dan@danm.net/ Investigated-by: Junxiao Bi Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240322081005.1112401-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid5.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d874abfc1836..2bd1ce9b3922 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include @@ -6734,6 +6733,9 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + break; + released = release_stripe_list(conf, conf->temp_inactive_list); if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); @@ -6770,18 +6772,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); From 3821bbad0d0fbb6c9d77987bd54c89348752056d Mon Sep 17 00:00:00 2001 From: Florian-Ewald Mueller Date: Wed, 27 Mar 2024 12:40:22 +0100 Subject: [PATCH 2/4] md: add check for sleepers in md_wakeup_thread() Check for sleeping thread before attempting its wake_up in md_wakeup_thread() to avoid unnecessary spinlock contention. With a 6.1 kernel, fio random read/write tests on many (>= 100) virtual volumes, of 100 GiB each, on 3 md-raid5s on 8 SSDs each (building a raid50), show by 3 to 4 % improved IOPS performance. Signed-off-by: Florian-Ewald Mueller Reviewed-by: Yu Kuai Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20240327114022.74634-1-jinpu.wang@ionos.com Signed-off-by: Song Liu --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e575e74aabf5..fd574b015d78 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8087,7 +8087,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); - wake_up(&t->wqueue); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } rcu_read_unlock(); } From 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 17 Jan 2024 11:19:45 +0800 Subject: [PATCH 3/4] md: Fix overflow in is_mddev_idle UBSAN reports this problem: UBSAN: Undefined behaviour in drivers/md/md.c:8175:15 signed integer overflow: -2147483291 - 2072033152 cannot be represented in type 'int' Call trace: dump_backtrace+0x0/0x310 show_stack+0x28/0x38 dump_stack+0xec/0x15c ubsan_epilogue+0x18/0x84 handle_overflow+0x14c/0x19c __ubsan_handle_sub_overflow+0x34/0x44 is_mddev_idle+0x338/0x3d8 md_do_sync+0x1bb8/0x1cf8 md_thread+0x220/0x288 kthread+0x1d8/0x1e0 ret_from_fork+0x10/0x18 'curr_events' will overflow when stat accum or 'sync_io' is greater than INT_MAX. Fix it by changing sync_io, last_events and curr_events to 64bit. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240117031946.2324519-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 4 ++-- include/linux/blkdev.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fd574b015d78..f28e41f9ac4a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8577,14 +8577,15 @@ static int is_mddev_idle(struct mddev *mddev, int init) { struct md_rdev *rdev; int idle; - int curr_events; + long long curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); + curr_events = + (long long)part_stat_read_accum(disk->part0, sectors) - + atomic64_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. diff --git a/drivers/md/md.h b/drivers/md/md.h index 097d9dbd69b8..d0db98c0d33b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -51,7 +51,7 @@ struct md_rdev { sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ + long long last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is @@ -621,7 +621,7 @@ extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); + atomic64_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..d0fa4f1c29a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -172,7 +172,7 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ + atomic64_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED From 9d1110f99c253ccef82e480bfe9f38a12eb797a7 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 17 Jan 2024 11:19:46 +0800 Subject: [PATCH 4/4] md: don't account sync_io if iostats of the disk is disabled If iostats is disabled, disk_stats will not be updated and part_stat_read_accum() only returns a constant value. In this case, continuing to count sync_io and to check is_mddev_idle() is no longer meaningful. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240117031946.2324519-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 4 ++++ drivers/md/md.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f28e41f9ac4a..00bbafcd27bb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8583,6 +8583,10 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; + + if (!init && !blk_queue_io_stat(disk->queue)) + continue; + curr_events = (long long)part_stat_read_accum(disk->part0, sectors) - atomic64_read(&disk->sync_io); diff --git a/drivers/md/md.h b/drivers/md/md.h index d0db98c0d33b..029dd0491a36 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -621,7 +621,8 @@ extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic64_add(nr_sectors, &bdev->bd_disk->sync_io); + if (blk_queue_io_stat(bdev->bd_disk->queue)) + atomic64_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)