From 21925ede449e038ed6f9efdfe0e79f15bddc34bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 30 Jan 2025 05:06:07 +0000 Subject: [PATCH 01/68] f2fs: introduce f2fs_base_attr for global sysfs entries In /sys/fs/f2fs/features, there's no f2fs_sb_info, so let's avoid to get the pointer. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 74 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d15c68b28952..b419555e1ea7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -880,14 +905,13 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } @@ -1256,37 +1280,38 @@ static struct attribute *f2fs_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1362,9 +1387,14 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { From 53333cdf5b03870a8ed4f8d434bd98b3d6771d5f Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Sun, 2 Feb 2025 13:32:53 +0900 Subject: [PATCH 02/68] f2fs: remove unnecessary null checking When __GFP_DIRECT_RECLAIM (included in both GFP_NOIO and GFP_KERNEL) is specified, bio_alloc_bioset() never fails to allocate a bio. Commit 67883ade7a98 ("f2fs: remove FAULT_ALLOC_BIO") replaced f2fs_bio_alloc() with bio_alloc_bioset(), but null checking after bio_alloc_bioset() was still left. Fixes: 67883ade7a98 ("f2fs: remove FAULT_ALLOC_BIO") Signed-off-by: Kohei Enju Reviewed-by: Jens Axboe Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index de4da6d9cd93..07b46b444d31 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1041,8 +1041,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); - if (!bio) - return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; From eb85c2410d6f581e957cd03a644ff6ddbe592af9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 8 Feb 2025 10:33:21 +0800 Subject: [PATCH 03/68] f2fs: quota: fix to avoid warning in dquot_writeback_dquots() F2FS-fs (dm-59): checkpoint=enable has some unwritten data. ------------[ cut here ]------------ WARNING: CPU: 6 PID: 8013 at fs/quota/dquot.c:691 dquot_writeback_dquots+0x2fc/0x308 pc : dquot_writeback_dquots+0x2fc/0x308 lr : f2fs_quota_sync+0xcc/0x1c4 Call trace: dquot_writeback_dquots+0x2fc/0x308 f2fs_quota_sync+0xcc/0x1c4 f2fs_write_checkpoint+0x3d4/0x9b0 f2fs_issue_checkpoint+0x1bc/0x2c0 f2fs_sync_fs+0x54/0x150 f2fs_do_sync_file+0x2f8/0x814 __f2fs_ioctl+0x1960/0x3244 f2fs_ioctl+0x54/0xe0 __arm64_sys_ioctl+0xa8/0xe4 invoke_syscall+0x58/0x114 checkpoint and f2fs_remount may race as below, resulting triggering warning in dquot_writeback_dquots(). atomic write remount - do_remount - down_write(&sb->s_umount); - f2fs_remount - ioctl - f2fs_do_sync_file - f2fs_sync_fs - f2fs_write_checkpoint - block_operations - locked = down_read_trylock(&sbi->sb->s_umount) : fail to lock due to the write lock was held by remount - up_write(&sb->s_umount); - f2fs_quota_sync - dquot_writeback_dquots - WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)) : trigger warning because s_umount lock was unlocked by remount If checkpoint comes from mount/umount/remount/freeze/quotactl, caller of checkpoint has already held s_umount lock, calling dquot_writeback_dquots() in the context should be safe. So let's record task to sbi->umount_lock_holder, so that checkpoint can know whether the lock has held in the context or not by checking current w/ it. In addition, in order to not misrepresent caller of checkpoint, we should not allow to trigger async checkpoint for those callers: mount/umount/remount/ freeze/quotactl. Fixes: af033b2aa8a8 ("f2fs: guarantee journalled quota data by checkpoint") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 15 ++++++---- fs/f2fs/f2fs.h | 3 +- fs/f2fs/super.c | 65 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index efda9a022981..bd890738b94d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1237,7 +1237,7 @@ static int block_operations(struct f2fs_sb_info *sbi) retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { - int locked; + bool need_lock = sbi->umount_lock_holder != current; if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); @@ -1246,11 +1246,13 @@ retry_flush_quotas: } f2fs_unlock_all(sbi); - /* only failed during mount/umount/freeze/quotactl */ - locked = down_read_trylock(&sbi->sb->s_umount); - f2fs_quota_sync(sbi->sb, -1); - if (locked) + /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */ + if (!need_lock) { + f2fs_do_quota_sync(sbi->sb, -1); + } else if (down_read_trylock(&sbi->sb->s_umount)) { + f2fs_do_quota_sync(sbi->sb, -1); up_read(&sbi->sb->s_umount); + } cond_resched(); goto retry_flush_quotas; } @@ -1867,7 +1869,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) struct cp_control cpc; cpc.reason = __get_cp_reason(sbi); - if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || + sbi->umount_lock_holder == current) { int ret; f2fs_down_write(&sbi->gc_lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1afa7be16e7d..493dda2d4b66 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1659,6 +1659,7 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -3624,7 +3625,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); -int f2fs_quota_sync(struct super_block *sb, int type); +int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19b67828ae32..1beff52ae80b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1737,22 +1737,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (f2fs_readonly(sb)) return 0; /* IO error happened before */ - if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; /* must be clean, since sync_filesystem() was already called */ - if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY)) return -EINVAL; + sbi->umount_lock_holder = current; + /* Let's flush checkpoints and stop the thread. */ - f2fs_flush_ckpt_thread(F2FS_SB(sb)); + f2fs_flush_ckpt_thread(sbi); + + sbi->umount_lock_holder = NULL; /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ - set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + set_sbi_flag(sbi, SBI_IS_FREEZING); return 0; } @@ -2329,6 +2335,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; + sbi->umount_lock_holder = current; + #ifdef CONFIG_QUOTA org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -2552,6 +2560,8 @@ skip: limit_reserve_root(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + + sbi->umount_lock_holder = NULL; return 0; restore_checkpoint: if (need_enable_checkpoint) { @@ -2592,6 +2602,8 @@ restore_opts: #endif sbi->mount_opt = org_mount_opt; sb->s_flags = old_sb_flags; + + sbi->umount_lock_holder = NULL; return err; } @@ -2908,7 +2920,7 @@ out: return ret; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); @@ -2956,11 +2968,21 @@ int f2fs_quota_sync(struct super_block *sb, int type) return ret; } +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + int ret; + + F2FS_SB(sb)->umount_lock_holder = current; + ret = f2fs_do_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = NULL; + return ret; +} + static int f2fs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { struct inode *inode; - int err; + int err = 0; /* if quota sysfile exists, deny enabling quota with specific file */ if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { @@ -2971,31 +2993,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) return -EXDEV; - err = f2fs_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = current; + + err = f2fs_do_quota_sync(sb, type); if (err) - return err; + goto out; inode = d_inode(path->dentry); err = filemap_fdatawrite(inode->i_mapping); if (err) - return err; + goto out; err = filemap_fdatawait(inode->i_mapping); if (err) - return err; + goto out; err = dquot_quota_on(sb, type, format_id, path); if (err) - return err; + goto out; inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); - - return 0; +out: + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } static int __f2fs_quota_off(struct super_block *sb, int type) @@ -3006,7 +3031,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - err = f2fs_quota_sync(sb, type); + err = f2fs_do_quota_sync(sb, type); if (err) goto out_put; @@ -3029,6 +3054,8 @@ static int f2fs_quota_off(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + F2FS_SB(sb)->umount_lock_holder = current; + err = __f2fs_quota_off(sb, type); /* @@ -3038,6 +3065,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } @@ -3170,7 +3200,7 @@ int f2fs_dquot_initialize(struct inode *inode) return 0; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { return 0; } @@ -4703,6 +4733,7 @@ try_onemore: if (err) goto free_compress_inode; + sbi->umount_lock_holder = current; #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4829,6 +4860,8 @@ reset_checkpoint: f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + sbi->umount_lock_holder = NULL; return 0; sync_free_meta: @@ -4931,6 +4964,8 @@ static void kill_f2fs_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb->s_root) { + sbi->umount_lock_holder = current; + set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); From bd409934c0619a6c3507891844aeb0c0dca69e18 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Mon, 10 Feb 2025 09:24:09 +0800 Subject: [PATCH 04/68] f2fs: fix to return SHRINK_EMPTY if no objects to free Quoted from include/linux/shrinker.h "count_objects should return the number of freeable items in the cache. If there are no objects to free, it should return SHRINK_EMPTY, while 0 is returned in cases of the number of freeable items cannot be determined or shrinker should skip this cache for this time (e.g., their number is below shrinkable limit)." Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 83d6fb97dcae..fc1bbef418ce 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, mutex_unlock(&sbi->umount_mutex); } spin_unlock(&f2fs_list_lock); - return count; + return count ?: SHRINK_EMPTY; } unsigned long f2fs_shrink_scan(struct shrinker *shrink, From 1534747d3170646ddeb9ea5f7caaac90359707cf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Feb 2025 15:36:32 +0800 Subject: [PATCH 05/68] f2fs: don't retry IO for corrupted data scenario F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] F2FS-fs (dm-105): inconsistent node block, nid:430, node_footer[nid:2198964142,ino:598252782,ofs:118300154,cpver:5409237455940746069,blkaddr:2125070942] If node block is loaded successfully, but its content is inconsistent, it doesn't need to retry IO. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3dd25f64d6f1..f238be29a70b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -765,8 +765,12 @@ retry: if (err == -ENOENT) return; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; +stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } From 4f91f074702af3931a35c244470ae0c4b66f909c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Feb 2025 09:54:13 +0800 Subject: [PATCH 06/68] f2fs: add dump_stack() in f2fs_handle_critical_error() To show call stack, so that we can see who causes critical error, note that it won't call dump_stack() for shutdown path. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1beff52ae80b..759b642991d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4250,6 +4250,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + else + dump_stack(); /* * Continue filesystem operators if errors=continue. Should not set From 5f95c1812a65e4e8a6b89b6c0bafd654e8bc03de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 4 Feb 2025 10:06:35 -0800 Subject: [PATCH 07/68] f2fs: add ioctl to get IO priority hint This patch adds an ioctl to give a per-file priority hint to attach REQ_PRIO. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 20 ++++++++++++++++++++ include/uapi/linux/f2fs.h | 7 +++++++ 4 files changed, 34 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 07b46b444d31..24c5cb1f5ada 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -413,6 +413,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -438,6 +439,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; + + if (fio->type == DATA && + F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + op_flags |= REQ_PRIO; + return op_flags; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 493dda2d4b66..395f9d37449c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -830,6 +830,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f92a9fba9991..a92c347841e6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3446,6 +3446,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) (u32 __user *)arg); } +static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 level; + + if (get_user(level, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX) + return -EINVAL; + + inode_lock(inode); + F2FS_I(inode)->ioprio_hint = level; + inode_unlock(inode); + return 0; +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -4547,6 +4564,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_compress_file(filp); case F2FS_IOC_GET_DEV_ALIAS_FILE: return f2fs_ioc_get_dev_alias_file(filp, arg); + case F2FS_IOC_IO_PRIO: + return f2fs_ioc_io_prio(filp, arg); default: return -ENOTTY; } @@ -5261,6 +5280,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DECOMPRESS_FILE: case F2FS_IOC_COMPRESS_FILE: case F2FS_IOC_GET_DEV_ALIAS_FILE: + case F2FS_IOC_IO_PRIO: break; default: return -ENOIOCTLCMD; diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index f7aaf8d23e20..795e26258355 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -44,6 +44,7 @@ #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) #define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) #define F2FS_IOC_GET_DEV_ALIAS_FILE _IOR(F2FS_IOCTL_MAGIC, 26, __u32) +#define F2FS_IOC_IO_PRIO _IOW(F2FS_IOCTL_MAGIC, 27, __u32) /* * should be same as XFS_IOC_GOINGDOWN. @@ -63,6 +64,12 @@ #define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */ #define F2FS_TRIM_FILE_MASK 0x3 +/* for F2FS_IOC_IO_PRIO */ +enum { + F2FS_IOPRIO_WRITE = 1, /* high write priority */ + F2FS_IOPRIO_MAX, +}; + struct f2fs_gc_range { __u32 sync; __u64 start; From 48ea8b200414ac69ea96f4c231f5c7ef1fbeffef Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Feb 2025 14:36:57 +0800 Subject: [PATCH 08/68] f2fs: fix to avoid panic once fallocation fails for pinfile syzbot reports a f2fs bug as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:2746! CPU: 0 UID: 0 PID: 5323 Comm: syz.0.0 Not tainted 6.13.0-rc2-syzkaller-00018-g7cb1b4663150 #0 RIP: 0010:get_new_segment fs/f2fs/segment.c:2746 [inline] RIP: 0010:new_curseg+0x1f52/0x1f70 fs/f2fs/segment.c:2876 Call Trace: __allocate_new_segment+0x1ce/0x940 fs/f2fs/segment.c:3210 f2fs_allocate_new_section fs/f2fs/segment.c:3224 [inline] f2fs_allocate_pinning_section+0xfa/0x4e0 fs/f2fs/segment.c:3238 f2fs_expand_inode_data+0x696/0xca0 fs/f2fs/file.c:1830 f2fs_fallocate+0x537/0xa10 fs/f2fs/file.c:1940 vfs_fallocate+0x569/0x6e0 fs/open.c:327 do_vfs_ioctl+0x258c/0x2e40 fs/ioctl.c:885 __do_sys_ioctl fs/ioctl.c:904 [inline] __se_sys_ioctl+0x80/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Concurrent pinfile allocation may run out of free section, result in panic in get_new_segment(), let's expand pin_sem lock coverage to include f2fs_gc(), so that we can make sure to reclaim enough free space for following allocation. In addition, do below changes to enhance error path handling: - call f2fs_bug_on() only in non-pinfile allocation path in get_new_segment(). - call reset_curseg_fields() to reset all fields of curseg in new_curseg() Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Reported-by: syzbot+15669ec8c35ddf6c3d43@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/675cd64e.050a0220.37aaf.00bb.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 +++++--- fs/f2fs/segment.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a92c347841e6..f2f298c75921 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1834,18 +1834,20 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: + f2fs_down_write(&sbi->pin_sem); + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); - if (err && err != -ENODATA) + if (err && err != -ENODATA) { + f2fs_up_write(&sbi->pin_sem); goto out_err; + } } - f2fs_down_write(&sbi->pin_sem); - err = f2fs_allocate_pinning_section(sbi); if (err) { f2fs_up_write(&sbi->pin_sem); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c282e8a0a2ec..6ebe25eafafa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2806,7 +2806,7 @@ find_other_zone: MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; - f2fs_bug_on(sbi, 1); + f2fs_bug_on(sbi, !pinning); goto out_unlock; } } @@ -2848,7 +2848,7 @@ got_it: out_unlock: spin_unlock(&free_i->segmap_lock); - if (ret == -ENOSPC) + if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } @@ -2921,6 +2921,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return curseg->segno; } +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = NULL_SEGNO; + curseg->next_segno = 0; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -2939,7 +2946,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) - curseg->segno = NULL_SEGNO; + reset_curseg_fields(curseg); return ret; } @@ -3710,13 +3717,6 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -static void reset_curseg_fields(struct curseg_info *curseg) -{ - curseg->inited = false; - curseg->segno = NULL_SEGNO; - curseg->next_segno = 0; -} - int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, From ef0c333cad8d1940f132a7ce15f15920216a3bd5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 31 Jan 2025 22:27:56 +0000 Subject: [PATCH 09/68] f2fs: keep POSIX_FADV_NOREUSE ranges This patch records POSIX_FADV_NOREUSE ranges for users to reclaim the caches instantly off from LRU. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 +++ fs/f2fs/f2fs.h | 12 +++++++++- fs/f2fs/file.c | 60 ++++++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/inode.c | 14 ++++++++++++ fs/f2fs/super.c | 1 + 5 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 468828288a4a..16c2dfb4f595 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndonate_files = sbi->donate_files; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = atomic_read(&sbi->atomic_files); @@ -501,6 +502,8 @@ static int stat_show(struct seq_file *s, void *v) si->compr_inode, si->compr_blocks); seq_printf(s, " - Swapfile Inode: %u\n", si->swapfile_inode); + seq_printf(s, " - Donate Inode: %u\n", + si->ndonate_files); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 395f9d37449c..3abcb84a0d47 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -850,6 +850,11 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + + /* linked in global inode list for cache donation */ + struct list_head gdonate_list; + pgoff_t donate_start, donate_end; /* inclusive */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ @@ -1274,6 +1279,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + DONATE_INODE, /* for all inode to donate pages */ NR_INODE_TYPE, }; @@ -1629,6 +1635,9 @@ struct f2fs_sb_info { unsigned int warm_data_age_threshold; unsigned int last_age_weight; + /* control donate caches */ + unsigned int donate_files; + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -3968,7 +3977,8 @@ struct f2fs_stat_info { unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f2f298c75921..014cb7660a9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2450,6 +2450,52 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) return ret; } +static void f2fs_keep_noreuse_range(struct inode *inode, + loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + u64 start, end; + + if (!S_ISREG(inode->i_mode)) + return; + + if (offset >= max_bytes || len > max_bytes || + (offset + len) > max_bytes) + return; + + start = offset >> PAGE_SHIFT; + end = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) { + inode_unlock(inode); + return; + } + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + /* let's remove the range, if len = 0 */ + if (!len) { + if (!list_empty(&F2FS_I(inode)->gdonate_list)) { + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + } + } else { + if (list_empty(&F2FS_I(inode)->gdonate_list)) { + list_add_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + sbi->donate_files++; + } else { + list_move_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + } + F2FS_I(inode)->donate_start = start; + F2FS_I(inode)->donate_end = end - 1; + } + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + inode_unlock(inode); +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -5168,12 +5214,16 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, } err = generic_fadvise(filp, offset, len, advice); - if (!err && advice == POSIX_FADV_DONTNEED && - test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && - f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + if (err) + return err; - return err; + if (advice == POSIX_FADV_DONTNEED && + (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode))) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + else if (advice == POSIX_FADV_NOREUSE) + f2fs_keep_noreuse_range(inode, offset, len); + return 0; } #ifdef CONFIG_COMPAT diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f238be29a70b..d6ad7810df69 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -808,6 +808,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } +static void f2fs_remove_donate_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (list_empty(&F2FS_I(inode)->gdonate_list)) + return; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + spin_unlock(&sbi->inode_lock[DONATE_INODE]); +} + /* * Called at the last iput() if i_nlink is zero */ @@ -842,6 +855,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); + f2fs_remove_donate_inode(inode); if (!IS_DEVICE_ALIASING(inode)) f2fs_destroy_extent_tree(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 759b642991d4..f5c69cc2de72 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1441,6 +1441,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->gdonate_list); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); From a907f3a68ee26ba493a08a958809208d17f3347e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 31 Jan 2025 22:27:57 +0000 Subject: [PATCH 10/68] f2fs: add a sysfs entry to reclaim POSIX_FADV_NOREUSE pages 1. fadvise(fd1, POSIX_FADV_NOREUSE, {0,3}); 2. fadvise(fd2, POSIX_FADV_NOREUSE, {1,2}); 3. fadvise(fd3, POSIX_FADV_NOREUSE, {3,1}); 4. echo 1024 > /sys/fs/f2fs/tuning/reclaim_caches_kb This gives a way to reclaim file-backed pages by iterating all f2fs mounts until reclaiming 1MB page cache ranges, registered by #1, #2, and #3. 5. cat /sys/fs/f2fs/tuning/reclaim_caches_kb -> gives total number of registered file ranges. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 ++ fs/f2fs/f2fs.h | 2 + fs/f2fs/shrinker.c | 90 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 63 +++++++++++++++++ 4 files changed, 162 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3e1630c70d8a..81deae2af84d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -828,3 +828,10 @@ Date: November 2024 Contact: "Chao Yu" Description: It controls max read extent count for per-inode, the value of threshold is 10240 by default. + +What: /sys/fs/f2fs/tuning/reclaim_caches_kb +Date: February 2025 +Contact: "Jaegeuk Kim" +Description: It reclaims the given KBs of file-backed pages registered by + ioctl(F2FS_IOC_DONATE_RANGE). + For example, writing N tries to drop N KBs spaces in LRU. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3abcb84a0d47..05879c6dc4d6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4243,6 +4243,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); +unsigned int f2fs_donate_files(void); +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index fc1bbef418ce..9c8d3aee89af 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -130,6 +130,96 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, return freed; } +unsigned int f2fs_donate_files(void) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int donate_files = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + donate_files += sbi->donate_files; + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + + return donate_files; +} + +static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, + unsigned int reclaim_caches_kb) +{ + struct inode *inode; + struct f2fs_inode_info *fi; + unsigned int nfiles = sbi->donate_files; + pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10); + + while (npages && nfiles--) { + pgoff_t len; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 0 : npages - len; + invalidate_inode_pages2_range(inode->i_mapping, + fi->donate_start, fi->donate_end); + iput(inode); + cond_resched(); + } + return npages << (PAGE_SHIFT - 10); +} + +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list && reclaim_caches_kb) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); +} + void f2fs_join_shrinker(struct f2fs_sb_info *sbi) { spin_lock(&f2fs_list_lock); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b419555e1ea7..b27336acf519 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -916,6 +916,39 @@ static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .show = f2fs_feature_show, \ } +static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) +{ + unsigned int res = 0; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + res = f2fs_donate_files(); + + return sysfs_emit(buf, "%u\n", res); +} + +static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + f2fs_reclaim_caches(t); + + return count; +} + +#define F2FS_TUNE_RW_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0644 }, \ + .show = f2fs_tune_show, \ + .store = f2fs_tune_store, \ +} + static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1368,6 +1401,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_sb_feat); +F2FS_TUNE_RW_ATTR(reclaim_caches_kb); + +static struct attribute *f2fs_tune_attrs[] = { + BASE_ATTR_LIST(reclaim_caches_kb), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_tune); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -1401,6 +1442,20 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static const struct sysfs_ops f2fs_tune_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_tune_ktype = { + .default_groups = f2fs_tune_groups, + .sysfs_ops = &f2fs_tune_attr_ops, +}; + +static struct kobject f2fs_tune = { + .kset = &f2fs_kset, +}; + static ssize_t f2fs_stat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1637,6 +1692,11 @@ int __init f2fs_init_sysfs(void) if (ret) goto put_kobject; + ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, + NULL, "tuning"); + if (ret) + goto put_kobject; + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); if (!f2fs_proc_root) { ret = -ENOMEM; @@ -1644,7 +1704,9 @@ int __init f2fs_init_sysfs(void) } return 0; + put_kobject: + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); return ret; @@ -1652,6 +1714,7 @@ put_kobject: void f2fs_exit_sysfs(void) { + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); From 1b60b23975d6d81703826e3797738e471c3009c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Feb 2025 14:20:07 +0800 Subject: [PATCH 11/68] f2fs: fix to set .discard_granularity correctly commit 4f993264fe29 ("f2fs: introduce discard_unit mount option") introduced a bug, when we enable discard_unit=section option, it will set .discard_granularity to BLKS_PER_SEC(), however discard granularity only supports [1, 512], once section size is not equal to segment size, it will cause issue_discard_thread() in DPOLICY_BG mode will not select discard entry w/ any granularity to issue. Fixes: 4f993264fe29 ("f2fs: introduce discard_unit mount option") Reviewed-by: Daeho Jeong Signed-off-by: Yohan Joung Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6ebe25eafafa..2b415926641f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2320,10 +2320,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || + F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) From 201e07aec617b10360df09090651dea9d0d4f7d3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2025 19:00:35 +0000 Subject: [PATCH 12/68] f2fs: fix the missing write pointer correction If checkpoint was disabled, we missed to fix the write pointers. Cc: Fixes: 1015035609e4 ("f2fs: fix changing cursegs if recovery fails on zoned device") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f5c69cc2de72..7a8fcc1e278c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4752,8 +4752,10 @@ try_onemore: if (err) goto free_meta; - if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) { + skip_recovery = true; goto reset_checkpoint; + } /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && From 81edb983b3f5d6900d3e337b9af7b1bec4bef0f2 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Thu, 27 Feb 2025 23:54:20 +0800 Subject: [PATCH 13/68] f2fs: add check for deleted inode The syzbot reproducer mounts a f2fs image, then tries to unlink an existing file. However, the unlinked file already has a link count of 0 when it is read for the first time in do_read_inode(). Add a check to sanity_check_inode() for i_nlink == 0. [Chao Yu: rebase the code and fix orphan inode recovery issue] Reported-by: syzbot+b01a36acd7007e273a83@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b01a36acd7007e273a83 Fixes: 39a53e0ce0df ("f2fs: add superblock and major in-memory structure") Signed-off-by: Leo Stone Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a278c7da8177..3d85d8116dae 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + if (inode->i_nlink == 0) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + goto out_iput; + } + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { From 3147ee567dd9004a49826ddeaf0a4b12865d4409 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 3 Mar 2025 11:23:29 +0800 Subject: [PATCH 14/68] f2fs: fix potential deadloop in prepare_compress_overwrite() Jan Prusakowski reported a kernel hang issue as below: When running xfstests on linux-next kernel (6.14.0-rc3, 6.12) I encountered a problem in generic/475 test where fsstress process gets blocked in __f2fs_write_data_pages() and the test hangs. The options I used are: MKFS_OPTIONS -- -O compression -O extra_attr -O project_quota -O quota /dev/vdc MOUNT_OPTIONS -- -o acl,user_xattr -o discard,compress_extension=* /dev/vdc /vdc INFO: task kworker/u8:0:11 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-xfstests-lockdep #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u8:0 state:D stack:0 pid:11 tgid:11 ppid:2 task_flags:0x4208160 flags:0x00004000 Workqueue: writeback wb_workfn (flush-253:0) Call Trace: __schedule+0x309/0x8e0 schedule+0x3a/0x100 schedule_preempt_disabled+0x15/0x30 __mutex_lock+0x59a/0xdb0 __f2fs_write_data_pages+0x3ac/0x400 do_writepages+0xe8/0x290 __writeback_single_inode+0x5c/0x360 writeback_sb_inodes+0x22f/0x570 wb_writeback+0xb0/0x410 wb_do_writeback+0x47/0x2f0 wb_workfn+0x5a/0x1c0 process_one_work+0x223/0x5b0 worker_thread+0x1d5/0x3c0 kthread+0xfd/0x230 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x1a/0x30 The root cause is: once generic/475 starts toload error table to dm device, f2fs_prepare_compress_overwrite() will loop reading compressed cluster pages due to IO error, meanwhile it has held .writepages lock, it can block all other writeback tasks. Let's fix this issue w/ below changes: - add f2fs_handle_page_eio() in prepare_compress_overwrite() to detect IO error. - detect cp_error earler in f2fs_read_multi_pages(). Fixes: 4c8ff7095bef ("f2fs: support data compression") Reported-by: Jan Prusakowski Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 1 + fs/f2fs/data.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 985690d81a82..9b94810675c1 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1150,6 +1150,7 @@ retry: f2fs_compress_ctx_add_page(cc, page_folio(page)); if (!PageUptodate(page)) { + f2fs_handle_page_eio(sbi, page_folio(page), DATA); release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 24c5cb1f5ada..d36c876acd74 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2182,6 +2182,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, int i; int ret = 0; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + from_dnode = false; + goto out_put_dnode; + } + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + @@ -2225,10 +2231,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; - if (unlikely(f2fs_cp_error(sbi))) { - ret = -EIO; - goto out_put_dnode; - } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: From d8f5b91d77a651705d3f76ba0ebd5d7981533333 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 3 Mar 2025 11:25:00 +0800 Subject: [PATCH 15/68] f2fs: fix to call f2fs_recover_quota_end() correctly f2fs_recover_quota_begin() and f2fs_recover_quota_end() should be called in pair, there is some cases we may skip calling f2fs_recover_quota_end(), fix it. Fixes: e1bb7d3d9cbf ("f2fs: fix to recover quota data correctly") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a8fcc1e278c..3c875dc07266 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4805,10 +4805,10 @@ try_onemore: } } +reset_checkpoint: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif -reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * write pointer consistency of cursegs and other zones are already From e6494977bd4a83862118a05f57a8df40256951c0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 3 Mar 2025 11:47:38 +0800 Subject: [PATCH 16/68] f2fs: fix to avoid out-of-bounds access in f2fs_truncate_inode_blocks() syzbot reports an UBSAN issue as below: ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in fs/f2fs/node.h:381:10 index 18446744073709550692 is out of range for type '__le32[5]' (aka 'unsigned int[5]') CPU: 0 UID: 0 PID: 5318 Comm: syz.0.0 Not tainted 6.14.0-rc3-syzkaller-00060-g6537cfb395f3 #0 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_out_of_bounds+0x121/0x150 lib/ubsan.c:429 get_nid fs/f2fs/node.h:381 [inline] f2fs_truncate_inode_blocks+0xa5e/0xf60 fs/f2fs/node.c:1181 f2fs_do_truncate_blocks+0x782/0x1030 fs/f2fs/file.c:808 f2fs_truncate_blocks+0x10d/0x300 fs/f2fs/file.c:836 f2fs_truncate+0x417/0x720 fs/f2fs/file.c:886 f2fs_file_write_iter+0x1bdb/0x2550 fs/f2fs/file.c:5093 aio_write+0x56b/0x7c0 fs/aio.c:1633 io_submit_one+0x8a7/0x18a0 fs/aio.c:2052 __do_sys_io_submit fs/aio.c:2111 [inline] __se_sys_io_submit+0x171/0x2e0 fs/aio.c:2081 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f238798cde9 index 18446744073709550692 (decimal, unsigned long long) = 0xfffffffffffffc64 (hexadecimal, unsigned long long) = -924 (decimal, long long) In f2fs_truncate_inode_blocks(), UBSAN detects that get_nid() tries to access .i_nid[-924], it means both offset[0] and level should zero. The possible case should be in f2fs_do_truncate_blocks(), we try to truncate inode size to zero, however, dn.ofs_in_node is zero and dn.node_page is not an inode page, so it fails to truncate inode page, and then pass zeroed free_from to f2fs_truncate_inode_blocks(), result in this issue. if (dn.ofs_in_node || IS_INODE(dn.node_page)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } I guess the reason why dn.node_page is not an inode page could be: there are multiple nat entries share the same node block address, once the node block address was reused, f2fs_get_node_page() may load a non-inode block. Let's add a sanity check for such condition to avoid out-of-bounds access issue. Reported-by: syzbot+6653f10281a1badc749e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/66fdcdf3.050a0220.40bef.0025.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f88392fc4ba9..c1274bcec68b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1135,7 +1135,14 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) { + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, inode->i_ino, + from, ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; } From 17683927d078fe2ff924f110bfbe913d84eebe54 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:35 +0000 Subject: [PATCH 17/68] f2fs: Add f2fs_folio_wait_writeback() Convert f2fs_wait_on_page_writeback() to f2fs_folio_wait_writeback() and add a compatibiility wrapper. Replaces five calls to compound_head() with one. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++-- fs/f2fs/segment.c | 19 +++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 05879c6dc4d6..645a53a1f39f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3769,8 +3769,10 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked); +#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ + f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2b415926641f..fd1f6e924e0f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4153,22 +4153,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, f2fs_update_data_blkaddr(dn, new_addr); } -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked) +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked) { - if (folio_test_writeback(page_folio(page))) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (folio_test_writeback(folio)) { + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ - f2fs_submit_merged_ipu_write(sbi, NULL, page); + f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page); if (ordered) { - wait_on_page_writeback(page); - f2fs_bug_on(sbi, locked && - folio_test_writeback(page_folio(page))); + folio_wait_writeback(folio); + f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { - wait_for_stable_page(page); + folio_wait_stable(folio); } } } From 36e1d6344aca13e1f20af099561db75f28649151 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:36 +0000 Subject: [PATCH 18/68] mm: Remove wait_for_stable_page() The last caller has been converted to call folio_wait_stable(), so we can remove this wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 1 - mm/folio-compat.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..a19d8e334194 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1256,7 +1256,6 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); -void wait_for_stable_page(struct page *page); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 1d1832e2a599..5766d135af1e 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); -void wait_for_stable_page(struct page *page) -{ - return folio_wait_stable(page_folio(page)); -} -EXPORT_SYMBOL_GPL(wait_for_stable_page); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); From 894ac9d330c9eb8e108d0d3771ffa7bc89112316 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:37 +0000 Subject: [PATCH 19/68] f2fs: Add f2fs_folio_put() Convert f2fs_put_page() to f2fs_folio_put() and add a wrapper. Replaces three calls to compound_head() with one. [Jaegeuk Kim: fix missing null pointer check in f2fs_put_page] Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 645a53a1f39f..15ab02ea1016 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2815,16 +2815,23 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } +static inline void f2fs_folio_put(struct folio *folio, bool unlock) +{ + if (!folio) + return; + + if (unlock) { + f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); + folio_unlock(folio); + } + folio_put(folio); +} + static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) return; - - if (unlock) { - f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); - unlock_page(page); - } - put_page(page); + f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) From 015d9c56bd5e925273bc75077f37f48496b2048d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:38 +0000 Subject: [PATCH 20/68] f2fs: Convert f2fs_flush_inline_data() to use a folio Use the folio APIs where they exist. Saves several hidden calls to compound_head(). Also removes a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c1274bcec68b..a624522c98bc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1958,32 +1958,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(page)) + if (!IS_INODE(&folio->page)) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { -continue_unlock: - unlock_page(page); - continue; - } - - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) + goto unlock; + if (!folio_test_dirty(folio)) + goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); continue; } - unlock_page(page); +unlock: + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); From 5d0a91284853124954d7747e5066141b008e83de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:39 +0000 Subject: [PATCH 21/68] f2fs: Convert f2fs_sync_node_pages() to use a folio Use the folio APIs where they exist. Saves several hidden calls to compound_head(). Also removes a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a624522c98bc..a2da7b726dbf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2007,7 +2007,7 @@ next_step: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2023,27 +2023,27 @@ next_step: * 1. dentry dnodes * 2. file dnodes */ - if (step == 0 && IS_DNODE(page)) + if (step == 0 && IS_DNODE(&folio->page)) continue; - if (step == 1 && (!IS_DNODE(page) || - is_cold_node(page))) + if (step == 1 && (!IS_DNODE(&folio->page) || + is_cold_node(&folio->page))) continue; - if (step == 2 && (!IS_DNODE(page) || - !is_cold_node(page))) + if (step == 2 && (!IS_DNODE(&folio->page) || + !is_cold_node(&folio->page))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) - lock_page(page); - else if (!trylock_page(page)) + folio_lock(folio); + else if (!folio_trylock(folio)) continue; - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } @@ -2053,29 +2053,29 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(page) && flush_dirty_inode(page)) + if (IS_INODE(&folio->page) && flush_dirty_inode(&folio->page)) goto lock_node; write_node: - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - ret = __write_node_page(page, false, &submitted, + ret = __write_node_page(&folio->page, false, &submitted, wbc, do_balance, io_type, NULL); if (ret) - unlock_page(page); + folio_unlock(folio); else if (submitted) nwritten++; From de90f76144246062206daff08e7cecb6c27298d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:40 +0000 Subject: [PATCH 22/68] f2fs: Pass a folio to flush_dirty_inode() Its one caller now has a folio; pass it in and do page conversions where necessary inside flush_dirty_inode(). Saves two hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a2da7b726dbf..ecba85f05221 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1927,18 +1927,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) return 1; } -static bool flush_dirty_inode(struct page *page) +static bool flush_dirty_inode(struct folio *folio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(&folio->page); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) return false; - f2fs_update_inode(inode, page); - unlock_page(page); + f2fs_update_inode(inode, &folio->page); + folio_unlock(folio); iput(inode); return true; @@ -2061,7 +2061,7 @@ continue_unlock: } /* flush dirty inode */ - if (IS_INODE(&folio->page) && flush_dirty_inode(&folio->page)) + if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) goto lock_node; write_node: f2fs_folio_wait_writeback(folio, NODE, true, true); From e23bebc3c0d2db16f959d43213a305e85efd8ae5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:41 +0000 Subject: [PATCH 23/68] f2fs: Convert f2fs_fsync_node_pages() to use a folio Use the folio APIs where they exist. Saves several hidden calls to compound_head(). Also removes a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ecba85f05221..eeda3e6f83af 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1811,7 +1811,7 @@ retry: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { @@ -1821,63 +1821,63 @@ retry: goto out; } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page) && page != last_page) { + if (!folio_test_dirty(folio) && &folio->page != last_page) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - if (!atomic || page == last_page) { - set_fsync_mark(page, 1); + if (!atomic || &folio->page == last_page) { + set_fsync_mark(&folio->page, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(page)) { + if (IS_INODE(&folio->page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - f2fs_update_inode(inode, page); - set_dentry_mark(page, + f2fs_update_inode(inode, &folio->page); + set_dentry_mark(&folio->page, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ - if (!PageDirty(page)) - set_page_dirty(page); + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - ret = __write_node_page(page, atomic && - page == last_page, + ret = __write_node_page(&folio->page, atomic && + &folio->page == last_page, &submitted, wbc, true, FS_NODE_IO, seq_id); if (ret) { - unlock_page(page); + folio_unlock(folio); f2fs_put_page(last_page, 0); break; } else if (submitted) { nwritten++; } - if (page == last_page) { - f2fs_put_page(page, 0); + if (&folio->page == last_page) { + f2fs_folio_put(folio, false); marked = true; break; } From 18f3814fa6a8aac0b32b3b8e8f85a6b601047ddc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:42 +0000 Subject: [PATCH 24/68] f2fs: Convert last_fsync_dnode() to use a folio Use the folio APIs where they exist. Saves several hidden calls to compound_head(). Also removes a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index eeda3e6f83af..90387dae2838 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1572,7 +1572,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; struct folio_batch fbatch; - struct page *last_page = NULL; + struct folio *last_folio = NULL; int nr_folios; folio_batch_init(&fbatch); @@ -1584,45 +1584,45 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); return ERR_PTR(-EIO); } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (last_page) - f2fs_put_page(last_page, 0); + if (last_folio) + f2fs_folio_put(last_folio, false); - get_page(page); - last_page = page; - unlock_page(page); + folio_get(folio); + last_folio = folio; + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } - return last_page; + return &last_folio->page; } static int __write_node_page(struct page *page, bool atomic, bool *submitted, From e11a31139517e8ddbe18f99504e60cfd74c58301 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:43 +0000 Subject: [PATCH 25/68] f2fs: Return a folio from last_fsync_dnode() Convert last_page to last_folio in f2fs_fsync_node_pages() and use folio APIs where they exist. Saves a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 90387dae2838..e2157c289552 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1568,7 +1568,7 @@ iput_out: iput(inode); } -static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; struct folio_batch fbatch; @@ -1622,7 +1622,7 @@ continue_unlock: folio_batch_release(&fbatch); cond_resched(); } - return &last_folio->page; + return last_folio; } static int __write_node_page(struct page *page, bool atomic, bool *submitted, @@ -1790,16 +1790,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, pgoff_t index; struct folio_batch fbatch; int ret = 0; - struct page *last_page = NULL; + struct folio *last_folio = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_folios; int nwritten = 0; if (atomic) { - last_page = last_fsync_dnode(sbi, ino); - if (IS_ERR_OR_NULL(last_page)) - return PTR_ERR_OR_ZERO(last_page); + last_folio = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_folio)) + return PTR_ERR_OR_ZERO(last_folio); } retry: folio_batch_init(&fbatch); @@ -1815,7 +1815,7 @@ retry: bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); ret = -EIO; goto out; @@ -1836,7 +1836,7 @@ continue_unlock: if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!folio_test_dirty(folio) && &folio->page != last_page) { + if (!folio_test_dirty(folio) && folio != last_folio) { /* someone wrote it for us */ goto continue_unlock; } @@ -1846,7 +1846,7 @@ continue_unlock: set_fsync_mark(&folio->page, 0); set_dentry_mark(&folio->page, 0); - if (!atomic || &folio->page == last_page) { + if (!atomic || folio == last_folio) { set_fsync_mark(&folio->page, 1); percpu_counter_inc(&sbi->rf_node_block_count); if (IS_INODE(&folio->page)) { @@ -1865,18 +1865,18 @@ continue_unlock: goto continue_unlock; ret = __write_node_page(&folio->page, atomic && - &folio->page == last_page, + folio == last_folio, &submitted, wbc, true, FS_NODE_IO, seq_id); if (ret) { folio_unlock(folio); - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); break; } else if (submitted) { nwritten++; } - if (&folio->page == last_page) { + if (folio == last_folio) { f2fs_folio_put(folio, false); marked = true; break; @@ -1890,11 +1890,11 @@ continue_unlock: } if (!ret && atomic && !marked) { f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", - ino, page_folio(last_page)->index); - lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true, true); - set_page_dirty(last_page); - unlock_page(last_page); + ino, last_folio->index); + folio_lock(last_folio); + f2fs_folio_wait_writeback(last_folio, NODE, true, true); + folio_mark_dirty(last_folio); + folio_unlock(last_folio); goto retry; } out: From 8d77f68daeb19b5568e1ceee682a00327a6ca77c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:44 +0000 Subject: [PATCH 26/68] f2fs: Add f2fs_grab_cache_folio() Convert f2fs_grab_cache_page() into f2fs_grab_cache_folio() and add a wrapper. Removes several calls to deprecated functions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 15ab02ea1016..954baa08957e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2776,33 +2776,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } -static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, - pgoff_t index, bool for_write) +static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, + pgoff_t index, bool for_write) { - struct page *page; + struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + fgf_t fgf_flags; + if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); + fgf_flags = FGP_LOCK | FGP_ACCESSED; else - page = find_lock_page(mapping, index); - if (page) - return page; + fgf_flags = FGP_LOCK; + folio = __filemap_get_folio(mapping, index, fgf_flags, 0); + if (!IS_ERR(folio)) + return folio; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) - return NULL; + return ERR_PTR(-ENOMEM); } if (!for_write) - return grab_cache_page(mapping, index); + return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, index); + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); - return page; + return folio; +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_grab_cache_folio(mapping, index, for_write); + + if (IS_ERR(folio)) + return NULL; + return &folio->page; } static inline struct page *f2fs_pagecache_get_page( From e33ce6bd4ea2703444509cafb3646a547573fbc3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:45 +0000 Subject: [PATCH 27/68] mm: Remove grab_cache_page_write_begin() All callers have now been converted to use folios, so remove this compatibility wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 3 --- mm/folio-compat.c | 8 -------- 2 files changed, 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a19d8e334194..45817e2106ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -990,9 +990,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index); - /* * Returns locked page at given index in given cache, creating it if needed. */ diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 5766d135af1e..45540942d148 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -84,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); - -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); -} -EXPORT_SYMBOL(grab_cache_page_write_begin); From 48a34c5981039f34ea84dc5f4eab1d3911b78bca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:46 +0000 Subject: [PATCH 28/68] f2fs: Use a folio in __get_node_page() Retrieve a folio from the page cache and use it throughout. Saves six hidden calls to compound_head() and removes a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e2157c289552..19581555e79b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1459,7 +1459,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, struct page *parent, int start) { - struct page *page; + struct folio *folio; int err; if (!nid) @@ -1467,11 +1467,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(folio)) + return ERR_CAST(folio); - err = read_node_page(page, 0); + err = read_node_page(&folio->page, 0); if (err < 0) { goto out_put_err; } else if (err == LOCKED_PAGE) { @@ -1482,40 +1482,40 @@ repeat: if (parent) f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto out_err; } - if (!f2fs_inode_chksum_verify(sbi, page)) { + if (!f2fs_inode_chksum_verify(sbi, &folio->page)) { err = -EFSBADCRC; goto out_err; } page_hit: - if (likely(nid == nid_of_node(page))) - return page; + if (likely(nid == nid_of_node(&folio->page))) + return &folio->page; f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), - next_blkaddr_of_node(page)); + nid, nid_of_node(&folio->page), ino_of_node(&folio->page), + ofs_of_node(&folio->page), cpver_of_node(&folio->page), + next_blkaddr_of_node(&folio->page)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); err = -EFSCORRUPTED; out_err: - ClearPageUptodate(page); + folio_clear_uptodate(folio); out_put_err: /* ENOENT comes from read_node_page which is not an error. */ if (err != -ENOENT) - f2fs_handle_page_eio(sbi, page_folio(page), NODE); - f2fs_put_page(page, 1); + f2fs_handle_page_eio(sbi, folio, NODE); + f2fs_folio_put(folio, true); return ERR_PTR(err); } From cd8f95718c89f45fb440422576e265e38f93bdbb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:47 +0000 Subject: [PATCH 29/68] f2fs: Use a folio in do_write_page() Convert fio->page to a folio then use it where folio APIs exist. Removes a reference to page->mapping and a hidden call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fd1f6e924e0f..1abdebfe6b48 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3901,6 +3901,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { + struct folio *folio = page_folio(fio->page); enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3911,9 +3912,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { - if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) + if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - end_page_writeback(fio->page); + folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, fio->page)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; From fb9660481e3ccb26143c0596420487513383e940 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:48 +0000 Subject: [PATCH 30/68] f2fs: Convert f2fs_write_end_io() to use a folio_iter Iterate over each folio in the bio instead of each page. Follow the pattern in ext4 for handling bounce folios. Removes a few calls to compound_head() and references to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d36c876acd74..0e22595ef1a6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -319,8 +319,7 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; @@ -328,34 +327,41 @@ static void f2fs_write_end_io(struct bio *bio) if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page, false); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + enum count_type type; - fscrypt_finalize_bounce_page(&page); + if (fscrypt_is_bounce_folio(folio)) { + struct folio *io_folio = folio; + + folio = fscrypt_pagecache_folio(io_folio); + fscrypt_free_bounce_page(&io_folio->page); + } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(page)) { - f2fs_compress_write_end_io(bio, page); + if (f2fs_is_compressed_page(&folio->page)) { + f2fs_compress_write_end_io(bio, &folio->page); continue; } #endif + type = WB_DATA_TYPE(&folio->page, false); + if (unlikely(bio->bi_status)) { - mapping_set_error(page->mapping, -EIO); + mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && - page_folio(page)->index != nid_of_node(page)); + f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && + folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); - if (f2fs_in_warm_node_list(sbi, page)) - f2fs_del_fsync_node_entry(sbi, page); - clear_page_private_gcing(page); - end_page_writeback(page); + if (f2fs_in_warm_node_list(sbi, &folio->page)) + f2fs_del_fsync_node_entry(sbi, &folio->page); + clear_page_private_gcing(&folio->page); + folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) From 521a468486906b5e16fd70ad030e4826d4009079 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:49 +0000 Subject: [PATCH 31/68] f2fs: Mark some functions as taking a const page pointer The compiler can make some optimisations if we tell it that a function call doesn't change this memory. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 954baa08957e..e0af6b44197a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2026,7 +2026,7 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct page *page) { return (struct f2fs_node *)page_address(page); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 6aea13024ac1..281d53c95c9a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -248,7 +248,7 @@ static inline nid_t nid_of_node(struct page *node_page) return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(struct page *node_page) +static inline unsigned int ofs_of_node(const struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); @@ -342,7 +342,7 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(struct page *node_page) +static inline bool IS_DNODE(const struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); @@ -389,7 +389,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(struct page *page, int type) +static inline int is_node(const struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & BIT(type); From 1a58a41ccce6da41bd5b98ede50227998d3e8ca3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:50 +0000 Subject: [PATCH 32/68] f2fs: Convert f2fs_in_warm_node_list() to take a folio All its callers now have access to a folio, so pass it in. Removes an access to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/node.c | 8 ++++---- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e22595ef1a6..383b97f30221 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -358,7 +358,7 @@ static void f2fs_write_end_io(struct bio *bio) folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); - if (f2fs_in_warm_node_list(sbi, &folio->page)) + if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, &folio->page); clear_page_private_gcing(&folio->page); folio_end_writeback(folio); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e0af6b44197a..8c7400516164 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3678,7 +3678,8 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, + const struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19581555e79b..cd37eeaf01a1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio) { - return NODE_MAPPING(sbi) == page->mapping && - IS_DNODE(page) && is_cold_node(page); + return NODE_MAPPING(sbi) == folio->mapping && + IS_DNODE(&folio->page) && is_cold_node(&folio->page); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -1701,7 +1701,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ - if (f2fs_in_warm_node_list(sbi, page)) { + if (f2fs_in_warm_node_list(sbi, folio)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1abdebfe6b48..50a346f7cb93 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3915,7 +3915,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); - if (f2fs_in_warm_node_list(fio->sbi, fio->page)) + if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } From 4d417ae2bfce4a778cf4e65d87ec124ba871b3fb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:51 +0000 Subject: [PATCH 33/68] f2fs: Add f2fs_get_node_folio() Change __get_node_page() to return a folio and convert back to a page in f2fs_get_node_page() and f2fs_get_node_page_ra(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c7400516164..46eac4d3386b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3700,6 +3700,7 @@ struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cd37eeaf01a1..2969c1644ad3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1456,7 +1456,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, +static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, struct page *parent, int start) { struct folio *folio; @@ -1469,7 +1469,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, repeat: folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); if (IS_ERR(folio)) - return ERR_CAST(folio); + return folio; err = read_node_page(&folio->page, 0); if (err < 0) { @@ -1500,7 +1500,7 @@ repeat: } page_hit: if (likely(nid == nid_of_node(&folio->page))) - return &folio->page; + return folio; f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(&folio->page), ino_of_node(&folio->page), @@ -1519,17 +1519,25 @@ out_put_err: return ERR_PTR(err); } +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_folio(sbi, nid, NULL, 0); +} + struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - return __get_node_page(sbi, nid, NULL, 0); + struct folio *folio = __get_node_folio(sbi, nid, NULL, 0); + + return &folio->page; } struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); + struct folio *folio = __get_node_folio(sbi, nid, parent, start); - return __get_node_page(sbi, nid, parent, start); + return &folio->page; } static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) From 520b17e093f42e5d71fdc36e5203cec69188ea56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:52 +0000 Subject: [PATCH 34/68] f2fs: Use a folio throughout f2fs_truncate_inode_blocks() Use f2fs_get_node_folio() to get a folio and use it throughout. Remove a few calls to compound_head() and a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2969c1644ad3..36614a1c2590 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1130,7 +1130,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) unsigned int nofs = 0; struct f2fs_inode *ri; struct dnode_of_data dn; - struct page *page; + struct folio *folio; trace_f2fs_truncate_inode_blocks_enter(inode, from); @@ -1147,16 +1147,16 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) return level; } - page = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_node_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - set_new_dnode(&dn, inode, page, NULL, 0); - unlock_page(page); + set_new_dnode(&dn, inode, &folio->page, NULL, 0); + folio_unlock(folio); - ri = F2FS_INODE(page); + ri = F2FS_INODE(&folio->page); switch (level) { case 0: case 1: @@ -1185,7 +1185,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(page, offset[0], true); + dn.nid = get_nid(&folio->page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1206,7 +1206,7 @@ skip_partial: BUG(); } if (err == -ENOENT) { - set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, "truncate node fail, ino:%lu, nid:%u, " @@ -1217,18 +1217,18 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(page, offset[0], true)) { - lock_page(page); - BUG_ON(page->mapping != NODE_MAPPING(sbi)); - set_nid(page, offset[0], 0, true); - unlock_page(page); + if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + folio_lock(folio); + BUG_ON(folio->mapping != NODE_MAPPING(sbi)); + set_nid(&folio->page, offset[0], 0, true); + folio_unlock(folio); } offset[1] = 0; offset[0]++; nofs += err; } fail: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } From 922e24acb49e5e32924c13d27b565a4807d777a4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:53 +0000 Subject: [PATCH 35/68] f2fs: Use a folio throughout __get_meta_page() Use f2fs_grab_cache_folio() to get a folio and use it throughout, removing seven calls to compound_head() and a reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bd890738b94d..75b7196d2c81 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -58,7 +58,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page; + struct folio *folio; struct f2fs_io_info fio = { .sbi = sbi, .type = META, @@ -74,37 +74,37 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: - page = f2fs_grab_cache_page(mapping, index, false); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - fio.page = page; + fio.page = &folio->page; err = f2fs_submit_page_bio(&fio); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { - f2fs_handle_page_eio(sbi, page_folio(page), META); - f2fs_put_page(page, 1); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_handle_page_eio(sbi, folio, META); + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } out: - return page; + return &folio->page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) From b8fcb8423053adaa27723010260aea90474b431a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:54 +0000 Subject: [PATCH 36/68] f2fs: Hoist the page_folio() call to the start of f2fs_merge_page_bio() Remove one call to compound_head() and a reference to page->mapping by calling page_folio() early on. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 383b97f30221..a7bd9b07235a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -888,6 +888,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct folio *folio = page_folio(fio->page); if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) @@ -901,8 +902,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, - page_folio(fio->page)->index, fio, GFP_NOIO); + f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, + folio->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -911,8 +912,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); From 4ae71b1996ef2668de28945a31e0337b5abc93be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:55 +0000 Subject: [PATCH 37/68] f2fs: Add f2fs_get_read_data_folio() Convert f2fs_get_read_data_page() into f2fs_get_read_data_folio() and add a compatibility wrapper. Saves seven hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 35 +++++++++++++++++------------------ fs/f2fs/f2fs.h | 14 ++++++++++++-- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a7bd9b07235a..8a26e586f719 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1203,18 +1203,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, - pgoff_t *next_pgofs) +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; int err; - page = f2fs_grab_cache_page(mapping, index, for_write); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (IS_ERR(folio)) + return folio; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { @@ -1249,9 +1248,9 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, goto put_err; } got_it: - if (PageUptodate(page)) { - unlock_page(page); - return page; + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + return folio; } /* @@ -1262,21 +1261,21 @@ got_it: * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - return page; + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + return folio; } - err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr, + err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; - return page; + return folio; put_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46eac4d3386b..8da92054baa0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3906,8 +3906,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, @@ -3937,6 +3937,16 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; +static inline struct page *f2fs_get_read_data_page(struct inode *inode, + pgoff_t index, blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) +{ + struct folio *folio = f2fs_get_read_data_folio(inode, index, op_flags, + for_write, next_pgofs); + + return &folio->page; +} + /* * gc.c */ From 20f974cd2124bd4e2eb599f047465232f63b57b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:56 +0000 Subject: [PATCH 38/68] f2fs: Add f2fs_get_lock_data_folio() Convert f2fs_get_lock_data_page() to f2fs_get_lock_data_folio() and add a compatibility wrapper. Removes three hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 +++++++++--------- fs/f2fs/f2fs.h | 10 +++++++++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8a26e586f719..0359b0c300d6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1310,23 +1310,23 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); - if (IS_ERR(page)) - return page; + folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); + if (IS_ERR(folio)) + return folio; /* wait for read completion */ - lock_page(page); - if (unlikely(page->mapping != mapping || !PageUptodate(page))) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } - return page; + return folio; } /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8da92054baa0..8939cfb5bc90 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3910,7 +3910,7 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs); -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); @@ -3947,6 +3947,14 @@ static inline struct page *f2fs_get_read_data_page(struct inode *inode, return &folio->page; } +static inline struct page *f2fs_get_lock_data_page(struct inode *inode, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write); + + return &folio->page; +} + /* * gc.c */ From 6d1ba45c8db084348e033db531161f883676b859 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:57 +0000 Subject: [PATCH 39/68] f2fs: Convert move_data_page() to use a folio Fetch a folio from the page cache and use it throughout, saving eight hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faf9fa1c804d..d0fffa2bd9f0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1449,14 +1449,14 @@ out: } static int move_data_page(struct inode *inode, block_t bidx, int gc_type, - unsigned int segno, int off) + unsigned int segno, int off) { - struct page *page; + struct folio *folio; int err = 0; - page = f2fs_get_lock_data_page(inode, bidx, true); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, bidx, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; @@ -1468,12 +1468,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; if (gc_type == BG_GC) { - if (folio_test_writeback(page_folio(page))) { + if (folio_test_writeback(folio)) { err = -EAGAIN; goto out; } - set_page_dirty(page); - set_page_private_gcing(page); + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,37 +1483,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = page, + .page = &folio->page, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, }; - bool is_dirty = PageDirty(page); + bool is_dirty = folio_test_dirty(folio); retry: - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { + folio_mark_dirty(folio); + if (folio_clear_dirty_for_io(folio)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(page); + set_page_private_gcing(&folio->page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(page); + clear_page_private_gcing(&folio->page); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) - set_page_dirty(page); + folio_mark_dirty(folio); } } out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } From ab907aa2a2f3224c9ad47e768d8b911382e0ec83 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:58 +0000 Subject: [PATCH 40/68] f2fs: Convert truncate_partial_data_page() to use a folio Retrieve a folio from the page cache and use it throughout. Saves five hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 014cb7660a9a..82b21baf5628 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -707,31 +707,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; if (!offset && !cache_only) return 0; if (cache_only) { - page = find_lock_page(mapping, index); - if (page && PageUptodate(page)) + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) + return 0; + if (folio_test_uptodate(folio)) goto truncate_out; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } - page = f2fs_get_lock_data_page(inode, index, true); - if (IS_ERR(page)) - return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true, true); - zero_user(page, offset, PAGE_SIZE - offset); + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_segment(folio, offset, folio_size(folio)); /* An encrypted inode should have a key and truncate the last page. */ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); return 0; } From a86e109ee2c6214c9be5520285525710a6163f42 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:59 +0000 Subject: [PATCH 41/68] f2fs: Convert gc_data_segment() to use a folio Use f2fs_get_read_data_folio() instead of f2fs_get_read_data_page(). Saves a hidden call to compound_head() in f2fs_put_page(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d0fffa2bd9f0..2b8f9239bede 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1542,7 +1542,6 @@ next_step: entry = sum; for (off = 0; off < usable_blks_in_seg; off++, entry++) { - struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; @@ -1585,6 +1584,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + struct folio *data_folio; int err; inode = f2fs_iget(sb, dni.ino); @@ -1635,15 +1635,15 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, start_bidx, + data_folio = f2fs_get_read_data_folio(inode, start_bidx, REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (IS_ERR(data_page)) { + if (IS_ERR(data_folio)) { iput(inode); continue; } - f2fs_put_page(data_page, 0); + f2fs_folio_put(data_folio, false); add_gc_inode(gc_list, inode); continue; } From 0cd402baa03b0cd790ddbfbce5aadb2ab6373263 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:52:00 +0000 Subject: [PATCH 42/68] f2fs: Add f2fs_find_data_folio() Convert f2fs_find_data_page() to f2fs_find_data_folio() and add a compatibility wrapper. Saves six hidden calls to compound_head(). This was the last caller of f2fs_get_read_data_page(), so remove the compatibility wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 33 ++++++++++++++++++--------------- fs/f2fs/f2fs.h | 12 +++++------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0359b0c300d6..09437dbd1b42 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1279,30 +1279,33 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = find_get_page_flags(mapping, index, FGP_ACCESSED); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) + goto read; + if (folio_test_uptodate(folio)) + return folio; + f2fs_folio_put(folio, false); - page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); - if (IS_ERR(page)) - return page; +read: + folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); + if (IS_ERR(folio)) + return folio; - if (PageUptodate(page)) - return page; + if (folio_test_uptodate(folio)) + return folio; - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); + folio_wait_locked(folio); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } - return page; + return folio; } /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8939cfb5bc90..c6cc2694f9ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3908,8 +3908,8 @@ int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, - pgoff_t *next_pgofs); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, @@ -3937,12 +3937,10 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; -static inline struct page *f2fs_get_read_data_page(struct inode *inode, - pgoff_t index, blk_opf_t op_flags, bool for_write, - pgoff_t *next_pgofs) +static inline struct page *f2fs_find_data_page(struct inode *inode, + pgoff_t index, pgoff_t *next_pgofs) { - struct folio *folio = f2fs_get_read_data_folio(inode, index, op_flags, - for_write, next_pgofs); + struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs); return &folio->page; } From d96e2802a802bea49973f82d921b45de910ecaca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:52:01 +0000 Subject: [PATCH 43/68] mm: Remove wait_on_page_locked() This compatibility wrapper has no callers left, so remove it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 5 ----- mm/filemap.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 45817e2106ee..b8c6fa320ee3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1242,11 +1242,6 @@ static inline int folio_wait_locked_killable(struct folio *folio) return folio_wait_bit_killable(folio, PG_locked); } -static inline void wait_on_page_locked(struct page *page) -{ - folio_wait_locked(page_folio(page)); -} - void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c..09a2c59a1ccb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1379,7 +1379,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. From 8a2d9f00d502e6ef68c6d52f0863856040ddd2db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Mar 2025 22:16:21 +0000 Subject: [PATCH 44/68] f2fs: set highest IO priority for checkpoint thread The checkpoint is the top priority thread which can stop all the filesystem operations. Let's make it RT priority. Reviewed-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 75b7196d2c81..a35595f8d3f5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -21,7 +21,7 @@ #include "iostat.h" #include -#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; From c2ecba026586cda6c7dc0fe9e6e60e7e9386c3bd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 8 Mar 2025 13:18:46 +0800 Subject: [PATCH 45/68] f2fs: control nat_bits feature via mount option Introduce a new mount option "nat_bits" to control nat_bits feature, by default nat_bits feature is disabled. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 ++ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 3 +++ fs/f2fs/super.c | 15 +++++++++++++++ 4 files changed, 21 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index fb7d2ee022bc..aad08eff0502 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -365,6 +365,8 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes: pending node write drop keep N/A pending meta write keep keep N/A ====================== =============== =============== ======== +nat_bits Enable nat_bits feature to enhance full/empty nat blocks access, + by default it's disabled. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6cc2694f9ac..20be0ff17dea 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -114,6 +114,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_GC_MERGE 0x02000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 +#define F2FS_MOUNT_NAT_BITS 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 36614a1c2590..af9900bc3714 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -3306,6 +3306,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + if (!test_opt(sbi, NAT_BITS)) + disable_nat_bits(sbi, true); + err = __get_nat_bitmaps(sbi); if (err) return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3c875dc07266..a2808f5fd705 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -190,6 +190,7 @@ enum { Opt_memory_mode, Opt_age_extent_cache, Opt_errors, + Opt_nat_bits, Opt_err, }; @@ -269,6 +270,7 @@ static match_table_t f2fs_tokens = { {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, {Opt_errors, "errors=%s"}, + {Opt_nat_bits, "nat_bits"}, {Opt_err, NULL}, }; @@ -1322,6 +1324,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_nat_bits: + set_opt(sbi, NAT_BITS); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -2135,6 +2140,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) seq_printf(seq, ",errors=%s", "panic"); + if (test_opt(sbi, NAT_BITS)) + seq_puts(seq, ",nat_bits"); + return 0; } @@ -2325,6 +2333,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + bool no_nat_bits = !test_opt(sbi, NAT_BITS); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2453,6 +2462,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) { + err = -EINVAL; + f2fs_warn(sbi, "switch nat_bits option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); From 1cf6b5670af1f4e9d5bf2f7201e368733c59cbdd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Mar 2025 17:11:46 +0800 Subject: [PATCH 46/68] f2fs: do sanity check on inode footer in f2fs_get_inode_page() This patch introduces a new wrapper f2fs_get_inode_page(), then, caller can use it to load inode block to page cache, meanwhile it will do sanity check on inode footer. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++--- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 2 +- fs/f2fs/inline.c | 22 ++++++++--------- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.c | 61 ++++++++++++++++++++++++++++++++---------------- fs/f2fs/node.h | 6 +++++ fs/f2fs/xattr.c | 4 ++-- 9 files changed, 69 insertions(+), 41 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 09437dbd1b42..bb701fa977bf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3404,7 +3404,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, restart: /* check inline_data */ - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -3467,7 +3467,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -3497,7 +3497,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 54dd52de7269..5a63ff0df03b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -551,7 +551,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20be0ff17dea..59e8fdcf4abe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3701,7 +3701,8 @@ struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82b21baf5628..2ddb93d1a10c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -761,7 +761,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3e3c35d4c98b..ad92e9008781 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -119,7 +119,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { struct page *ipage; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { folio_unlock(folio); return PTR_ERR(ipage); @@ -237,7 +237,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -265,7 +265,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *ipage; - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -312,7 +312,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -331,7 +331,7 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); @@ -361,7 +361,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct page *ipage; void *inline_dentry; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -609,7 +609,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) if (err) goto out; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out_fname; @@ -644,7 +644,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct page *page = NULL; int err = 0; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -734,7 +734,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -765,7 +765,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -797,7 +797,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d6ad7810df69..30259e73145b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -410,7 +410,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -757,7 +757,7 @@ void f2fs_update_inode_page(struct inode *inode) struct page *node_page; int count = 0; retry: - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index af9900bc3714..89f1b4b92776 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -778,7 +778,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = f2fs_get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_inode_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -1147,7 +1147,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) return level; } - folio = f2fs_get_node_folio(sbi, inode->i_ino); + folio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(folio)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); return PTR_ERR(folio); @@ -1456,8 +1456,27 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } +static int sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct page *page, pgoff_t nid, + enum node_type ntype) +{ + if (unlikely(nid != nid_of_node(page) || + (ntype == NODE_TYPE_INODE && !IS_INODE(page)))) { + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; + } + return 0; +} + static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, - struct page *parent, int start) + struct page *parent, int start, + enum node_type ntype) { struct folio *folio; int err; @@ -1499,16 +1518,9 @@ repeat: goto out_err; } page_hit: - if (likely(nid == nid_of_node(&folio->page))) + err = sanity_check_node_footer(sbi, &folio->page, nid, ntype); + if (!err) return folio; - - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - nid, nid_of_node(&folio->page), ino_of_node(&folio->page), - ofs_of_node(&folio->page), cpver_of_node(&folio->page), - next_blkaddr_of_node(&folio->page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - err = -EFSCORRUPTED; out_err: folio_clear_uptodate(folio); out_put_err: @@ -1519,14 +1531,22 @@ out_put_err: return ERR_PTR(err); } -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) -{ - return __get_node_folio(sbi, nid, NULL, 0); -} - struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct folio *folio = __get_node_folio(sbi, nid, NULL, 0); + struct folio *folio = __get_node_folio(sbi, nid, NULL, 0, + NODE_TYPE_REGULAR); + + return &folio->page; +} + +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); +} + +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + struct folio *folio = f2fs_get_inode_folio(sbi, ino); return &folio->page; } @@ -1535,7 +1555,8 @@ struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); - struct folio *folio = __get_node_folio(sbi, nid, parent, start); + struct folio *folio = __get_node_folio(sbi, nid, parent, start, + NODE_TYPE_REGULAR); return &folio->page; } @@ -2727,7 +2748,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 281d53c95c9a..5079c6a2298d 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,6 +52,12 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, +}; + /* * For node information */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3f3874943679..d5b42e1005d8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -282,7 +282,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = f2fs_get_node_page(sbi, inode->i_ino); + page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -449,7 +449,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = f2fs_get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); From 2aac2538a97d35b0a1beb60dce6001f5625b82e6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Mar 2025 17:11:47 +0800 Subject: [PATCH 47/68] f2fs: do sanity check on xattr node footer in f2fs_get_xnode_page() This patch introduces a new wrapper f2fs_get_xnode_page(), then, caller can use it to load xattr block to page cache, meanwhile it will do sanity check on xattr node footer. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 14 ++++++++++++-- fs/f2fs/node.h | 1 + fs/f2fs/xattr.c | 4 ++-- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 59e8fdcf4abe..9ddf73c627f5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3703,6 +3703,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 89f1b4b92776..f958ce9f58ae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1245,7 +1245,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = f2fs_get_node_page(sbi, nid); + npage = f2fs_get_xnode_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1461,7 +1461,9 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, enum node_type ntype) { if (unlikely(nid != nid_of_node(page) || - (ntype == NODE_TYPE_INODE && !IS_INODE(page)))) { + (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + (ntype == NODE_TYPE_XATTR && + !f2fs_has_xattr_block(ofs_of_node(page))))) { f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", ntype, nid, nid_of_node(page), ino_of_node(page), @@ -1551,6 +1553,14 @@ struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) return &folio->page; } +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid) +{ + struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0, + NODE_TYPE_XATTR); + + return &folio->page; +} + struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 5079c6a2298d..103a437e6425 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -56,6 +56,7 @@ enum { enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, + NODE_TYPE_XATTR, }; /* diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d5b42e1005d8..c691b35618ad 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -303,7 +303,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = f2fs_get_node_page(sbi, xnid); + xpage = f2fs_get_xnode_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -475,7 +475,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); f2fs_alloc_nid_failed(sbi, new_nid); From 1788971e0bfae0911e356ba7f8a517d659d6709d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Mar 2025 17:11:48 +0800 Subject: [PATCH 48/68] f2fs: introduce FAULT_INCONSISTENT_FOOTER To simulate inconsistent node footer error. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 3 ++- fs/f2fs/super.c | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 81deae2af84d..b9a000e5098a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -734,6 +734,7 @@ Description: Support configuring fault injection type, should be FAULT_BLKADDR_VALIDITY 0x000040000 FAULT_BLKADDR_CONSISTENCE 0x000080000 FAULT_NO_SEGMENT 0x000100000 + FAULT_INCONSISTENT_FOOTER 0x000200000 =========================== =========== What: /sys/fs/f2fs//discard_io_aware_gran diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index aad08eff0502..e15c4275862a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -206,6 +206,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_BLKADDR_VALIDITY 0x000040000 FAULT_BLKADDR_CONSISTENCE 0x000080000 FAULT_NO_SEGMENT 0x000100000 + FAULT_INCONSISTENT_FOOTER 0x000200000 =========================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9ddf73c627f5..307ec68d8de0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,6 +62,7 @@ enum { FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, + FAULT_INCONSISTENT_FOOTER, FAULT_MAX, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f958ce9f58ae..9ea40fa22b3f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1463,7 +1463,8 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, if (unlikely(nid != nid_of_node(page) || (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(page))))) { + !f2fs_has_xattr_block(ofs_of_node(page))) || + time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", ntype, nid, nid_of_node(page), ino_of_node(page), diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a2808f5fd705..397df271885c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -63,6 +63,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", + [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, From 986c50f6bca109c6cf362b4e2babcb85aba958f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Feb 2025 18:29:23 +0800 Subject: [PATCH 49/68] f2fs: fix to avoid accessing uninitialized curseg syzbot reports a f2fs bug as below: F2FS-fs (loop3): Stopped filesystem due to reason: 7 kworker/u8:7: attempt to access beyond end of device BUG: unable to handle page fault for address: ffffed1604ea3dfa RIP: 0010:get_ckpt_valid_blocks fs/f2fs/segment.h:361 [inline] RIP: 0010:has_curseg_enough_space fs/f2fs/segment.h:570 [inline] RIP: 0010:__get_secs_required fs/f2fs/segment.h:620 [inline] RIP: 0010:has_not_enough_free_secs fs/f2fs/segment.h:633 [inline] RIP: 0010:has_enough_free_secs+0x575/0x1660 fs/f2fs/segment.h:649 f2fs_is_checkpoint_ready fs/f2fs/segment.h:671 [inline] f2fs_write_inode+0x425/0x540 fs/f2fs/inode.c:791 write_inode fs/fs-writeback.c:1525 [inline] __writeback_single_inode+0x708/0x10d0 fs/fs-writeback.c:1745 writeback_sb_inodes+0x820/0x1360 fs/fs-writeback.c:1976 wb_writeback+0x413/0xb80 fs/fs-writeback.c:2156 wb_do_writeback fs/fs-writeback.c:2303 [inline] wb_workfn+0x410/0x1080 fs/fs-writeback.c:2343 process_one_work kernel/workqueue.c:3236 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3317 worker_thread+0x870/0xd30 kernel/workqueue.c:3398 kthread+0x7a9/0x920 kernel/kthread.c:464 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Commit 8b10d3653735 ("f2fs: introduce FAULT_NO_SEGMENT") allows to trigger no free segment fault in allocator, then it will update curseg->segno to NULL_SEGNO, though, CP_ERROR_FLAG has been set, f2fs_write_inode() missed to check the flag, and access invalid curseg->segno directly in below call path, then resulting in panic: - f2fs_write_inode - f2fs_is_checkpoint_ready - has_enough_free_secs - has_not_enough_free_secs - __get_secs_required - has_curseg_enough_space - get_ckpt_valid_blocks : access invalid curseg->segno To avoid this issue, let's: - check CP_ERROR_FLAG flag in prior to f2fs_is_checkpoint_ready() in f2fs_write_inode(). - in has_curseg_enough_space(), save curseg->segno into a temp variable, and verify its validation before use. Fixes: 8b10d3653735 ("f2fs: introduce FAULT_NO_SEGMENT") Reported-by: syzbot+b6b347b7a4ea1b2e29b6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67973c2b.050a0220.11b1bb.0089.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 7 +++++++ fs/f2fs/segment.h | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 30259e73145b..aa2f41696a88 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -793,6 +793,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + /* + * no need to update inode page, ultimately f2fs_evict_inode() will + * clear dirty status of inode. + */ + if (f2fs_cp_error(sbi)) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) { f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 943be4f1d6d2..0465dc00b349 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -559,13 +559,16 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) { - unsigned int segno, left_blocks, blocks; int i; /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); @@ -576,6 +579,10 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, /* check current data section for dentry blocks. */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) From 19426c4988aa85298c1b4caf2889d37ec5c80fea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Mar 2025 19:07:12 +0800 Subject: [PATCH 50/68] Revert "f2fs: rebuild nat_bits during umount" This reverts commit 94c821fb286b545d37549ff30a0c341e066f0d6c. It reports that there is potential corruption in node footer, the most suspious feature is nat_bits, let's revert recovery related code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 21 +++------ fs/f2fs/f2fs.h | 32 +++++++++++++- fs/f2fs/node.c | 101 ++++++++++--------------------------------- 3 files changed, 59 insertions(+), 95 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a35595f8d3f5..ac21461b22c9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1346,21 +1346,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to no space"); - } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && - f2fs_nat_bitmap_enabled(sbi)) { - f2fs_enable_nat_bits(sbi); - set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Rebuild and enable nat_bits"); - } - } - spin_lock_irqsave(&sbi->cp_lock, flags); + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else @@ -1543,8 +1535,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if ((cpc->reason & CP_UMOUNT) && - is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 307ec68d8de0..99ded6512d8b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2232,6 +2232,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. + */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); @@ -3696,7 +3726,6 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3724,7 +3753,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9ea40fa22b3f..3b6f3d72456c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2311,24 +2311,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int i; - bool ret = true; - - f2fs_down_read(&nm_i->nat_tree_lock); - for (i = 0; i < nm_i->nat_blocks; i++) { - if (!test_bit_le(i, nm_i->nat_block_bitmap)) { - ret = false; - break; - } - } - f2fs_up_read(&nm_i->nat_tree_lock); - - return ret; -} - static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -3007,23 +2989,7 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, - unsigned int valid) -{ - if (valid == 0) { - __set_bit_le(nat_ofs, nm_i->empty_nat_bits); - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); - return; - } - - __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_ofs, nm_i->full_nat_bits); - else - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); -} - -static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -3032,7 +2998,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; if (nat_index == 0) { @@ -3043,36 +3009,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - - __update_nat_bits(nm_i, nat_index, valid); -} - -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs; - - f2fs_down_read(&nm_i->nat_tree_lock); - - for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { - unsigned int valid = 0, nid_ofs = 0; - - /* handle nid zero due to it should never be used */ - if (unlikely(nat_ofs == 0)) { - valid = 1; - nid_ofs = 1; - } - - for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { - if (!test_bit_le(nid_ofs, - nm_i->free_nid_bitmap[nat_ofs])) - valid++; - } - - __update_nat_bits(nm_i, nat_ofs, valid); + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; } - f2fs_up_read(&nm_i->nat_tree_lock); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3091,7 +3038,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if ((cpc->reason & CP_UMOUNT) || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -3138,7 +3085,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - update_nat_bits(sbi, start_nid, page); + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3169,7 +3116,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (cpc->reason & CP_UMOUNT) { + if (enabled_nat_bits(sbi, cpc)) { f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); f2fs_up_write(&nm_i->nat_tree_lock); @@ -3185,7 +3132,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (cpc->reason & CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3222,18 +3169,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; + if (!enabled_nat_bits(sbi, NULL)) + return 0; + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) - return 0; - nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3250,12 +3194,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", - cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + disable_nat_bits(sbi, true); return 0; } + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3266,7 +3211,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; for (i = 0; i < nm_i->nat_blocks; i++) { From 448a834f89add24191f73661e6133062cc580317 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Mar 2025 18:21:47 +0000 Subject: [PATCH 51/68] f2fs: Remove check for ->writepage We're almost able to remove a_ops->writepage. This check is unnecessary as we'll never call into __f2fs_write_data_pages() for character devices. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bb701fa977bf..8575358f5190 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3280,10 +3280,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping, int ret; bool locked = false; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; From 6ad3ddbee892884706ab227af3f5c75deb206442 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Mar 2025 18:21:48 +0000 Subject: [PATCH 52/68] f2fs: Remove f2fs_write_data_page() Mappings which implement writepages should not implement writepage as it can only harm writeback patterns. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8575358f5190..d872f785a996 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2935,29 +2935,6 @@ redirty_out: return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); -#ifdef CONFIG_F2FS_FS_COMPRESSION - struct inode *inode = folio->mapping->host; - - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) - goto out; - - if (f2fs_compressed_file(inode)) { - if (f2fs_is_compressed_cluster(inode, folio->index)) { - folio_redirty_for_writepage(wbc, folio); - return AOP_WRITEPAGE_ACTIVATE; - } - } -out: -#endif - - return f2fs_write_single_data_page(folio, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0, true); -} - /* * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from @@ -4111,7 +4088,6 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, - .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, From 3b47398d9861db170931a0f9e3ec894eebbbb1f0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Mar 2025 18:21:49 +0000 Subject: [PATCH 53/68] f2fs: Remove f2fs_write_meta_page() Mappings which implement writepages should not implement writepage as it can only harm writeback patterns. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ac21461b22c9..cf77987d0698 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -381,12 +381,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) -{ - return __f2fs_write_meta_page(page, wbc, FS_META_IO); -} - static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -507,7 +501,6 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, } const struct address_space_operations f2fs_meta_aops = { - .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, From 7ff0104a805245d76cd2bdcbb9e2ca4f4fcff3e4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Mar 2025 18:21:50 +0000 Subject: [PATCH 54/68] f2fs: Remove f2fs_write_node_page() Mappings which implement writepages should not implement writepage as it can only harm writeback patterns. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3b6f3d72456c..5f15c224bf78 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1816,13 +1816,6 @@ release_page: return err; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - return __write_node_page(page, false, NULL, wbc, false, - FS_NODE_IO, NULL); -} - int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id) @@ -2249,7 +2242,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { - .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, From f7f8932ca6bb22494ef6db671633ad3b4d982271 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Mar 2025 17:01:25 +0800 Subject: [PATCH 55/68] f2fs: fix to avoid running out of free segments If checkpoint is disabled, GC can not reclaim any segments, we need to detect such condition and bail out from fallocate() of a pinfile, rather than letting allocator running out of free segment, which may cause f2fs to be shutdown. reproducer: mkfs.f2fs -f /dev/vda 16777216 mount -o checkpoint=disable:10% /dev/vda /mnt/f2fs for ((i=0;i<4096;i++)) do { dd if=/dev/zero of=/mnt/f2fs/$i bs=1M count=1; } done sync for ((i=0;i<4096;i+=2)) do { rm /mnt/f2fs/$i; } done sync touch /mnt/f2fs/pinfile f2fs_io pinfile set /mnt/f2fs/pinfile f2fs_io fallocate 0 0 4201644032 /mnt/f2fs/pinfile cat /sys/kernel/debug/f2fs/status output: - Free: 0 (0) Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2ddb93d1a10c..abbcbb5865a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1838,6 +1838,18 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, next_alloc: f2fs_down_write(&sbi->pin_sem); + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (has_not_enough_free_secs(sbi, 0, 0)) { + f2fs_up_write(&sbi->pin_sem); + err = -ENOSPC; + f2fs_warn_ratelimited(sbi, + "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "reclaim enough free segment when checkpoint is enabled", + inode->i_ino, pg_start, pg_end); + goto out_err; + } + } + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { From d7b549def0eb42a950eebd3bd5343f5c8088c305 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 11 Mar 2025 11:29:31 -0700 Subject: [PATCH 56/68] f2fs: add carve_out sysfs node For several zoned storage devices, vendors will provide extra space which was used for device level GC than specs and F2FS can use this space for filesystem level GC. To do that, we can reserve the space using reserved_blocks. However, it is not enough, since this extra space should not be shown to users. So, with this new sysfs node, we can hide the space by substracting reserved_blocks from total bytes. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 10 ++++++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 3 ++- fs/f2fs/sysfs.c | 2 ++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b9a000e5098a..2c85c6b8da8d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -836,3 +836,13 @@ Contact: "Jaegeuk Kim" Description: It reclaims the given KBs of file-backed pages registered by ioctl(F2FS_IOC_DONATE_RANGE). For example, writing N tries to drop N KBs spaces in LRU. + +What: /sys/fs/f2fs//carve_out +Date: March 2025 +Contact: "Daeho Jeong" +Description: For several zoned storage devices, vendors will provide extra space which + was used for device level GC than specs and F2FS can use this space for + filesystem level GC. To do that, we can reserve the space using + reserved_blocks. However, it is not enough, since this extra space should + not be shown to users. So, with this new sysfs node, we can hide the space + by substracting reserved_blocks from total bytes. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 99ded6512d8b..a8c7fb46222e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1813,6 +1813,9 @@ struct f2fs_sb_info { u64 committed_atomic_block; u64 revoked_atomic_block; + /* carve out reserved_blocks from total blocks */ + bool carve_out; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 397df271885c..1da1f154e5d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1849,7 +1849,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; spin_lock(&sbi->stat_lock); - + if (sbi->carve_out) + buf->f_blocks -= sbi->current_reserved_blocks; user_block_count = sbi->user_block_count; total_valid_node_count = valid_node_count(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b27336acf519..c69161366467 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1122,6 +1122,7 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif +F2FS_SBI_GENERAL_RW_ATTR(carve_out); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1309,6 +1310,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), + ATTR_LIST(carve_out), NULL, }; ATTRIBUTE_GROUPS(f2fs); From 64ee7503cbf662a3f4d6f464178de1607849e37e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:11 -0600 Subject: [PATCH 57/68] f2fs: use f2fs_sb_has_device_alias during option parsing Rather than using F2FS_HAS_FEATURE directly, use f2fs_sb_has_device_alias macro during option parsing for consistency. Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1da1f154e5d4..40c78f526f94 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -841,7 +841,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + if (f2fs_sb_has_device_alias(sbi)) { f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } From 277352b6cbeda5c0976e56d25e3fcd775db96305 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:12 -0600 Subject: [PATCH 58/68] f2fs: consolidate unsupported option handling errors When certain build-time options are disabled, some mount options are not accepted. For quota and compression, all related options are dismissed with a single error message. For xattr, acl, and fault injection, each option is handled individually. In addition, inline_xattr_size was missed when CONFIG_F2FS_FS_XATTR was disabled. Collapse xattr, acl, and fault injection errors into a single string, for simplicity, and handle the missing inline_xattr_size case. Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 40c78f526f94..1d48900d8cd3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -775,16 +775,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_user_xattr: - f2fs_info(sbi, "user_xattr options not supported"); - break; case Opt_nouser_xattr: - f2fs_info(sbi, "nouser_xattr options not supported"); - break; case Opt_inline_xattr: - f2fs_info(sbi, "inline_xattr options not supported"); - break; case Opt_noinline_xattr: - f2fs_info(sbi, "noinline_xattr options not supported"); + case Opt_inline_xattr_size: + f2fs_info(sbi, "xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -796,10 +791,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_acl: - f2fs_info(sbi, "acl options not supported"); - break; case Opt_noacl: - f2fs_info(sbi, "noacl options not supported"); + f2fs_info(sbi, "acl options not supported"); break; #endif case Opt_active_logs: @@ -922,11 +915,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_fault_injection: - f2fs_info(sbi, "fault_injection options not supported"); - break; - case Opt_fault_type: - f2fs_info(sbi, "fault_type options not supported"); + f2fs_info(sbi, "fault injection options not supported"); break; #endif case Opt_lazytime: From abd0e040e9a516fe1205d12ee33e1778ec546941 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:13 -0600 Subject: [PATCH 59/68] f2fs: factor out an f2fs_default_check function The current options parsing function both parses options and validates them - factor the validation out to reduce the size of the function and make transition to the new mount API possible, because under the new mount API, options are parsed one at a time, and cannot all be tested at the end of the parsing function. Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1d48900d8cd3..bdce7940fe20 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -690,7 +690,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - goto default_check; + return 0; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1324,7 +1324,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } } -default_check: + return 0; +} + +static int f2fs_default_check(struct f2fs_sb_info *sbi) +{ #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -2384,6 +2388,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } #endif + err = f2fs_default_check(sbi); + if (err) + goto restore_opts; + /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); @@ -4539,6 +4547,10 @@ try_onemore: if (err) goto free_options; + err = f2fs_default_check(sbi); + if (err) + goto free_options; + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; From 7d6ee503307125b5fb30b4863d3e8ffd0d808ca4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:14 -0600 Subject: [PATCH 60/68] f2fs: make INLINECRYPT a mount option flag Set INLINECRYPT into sbi during parsing, and transfer it to the sb in fill_super, so that an sb is not required during option parsing. Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8c7fb46222e..0978fbb7f885 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -116,6 +116,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 #define F2FS_MOUNT_NAT_BITS 0x10000000 +#define F2FS_MOUNT_INLINECRYPT 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bdce7940fe20..55ccb7a35f02 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1039,7 +1039,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - sb->s_flags |= SB_INLINECRYPT; + set_opt(sbi, INLINECRYPT); #else f2fs_info(sbi, "inline encryption not supported"); #endif @@ -4585,6 +4585,9 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + if (test_opt(sbi, INLINECRYPT)) + sb->s_flags |= SB_INLINECRYPT; + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); super_set_sysfs_name_bdev(sb); sb->s_iflags |= SB_I_CGROUPWB; From 9100adf326fa246beb03a26e5b374af4a6b4047d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:15 -0600 Subject: [PATCH 61/68] f2fs: make LAZYTIME a mount option flag Set LAZYTIME into sbi during parsing, and transfer it to the sb in fill_super, so that an sb is not required during option parsing. (Note: While lazytime is normally handled via mount flag in the vfs, some f2fs users do expect to be able to use it as an explicit mount option string via the mount syscall, so this option must remain.) Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 11 ++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0978fbb7f885..f1576dc6ec67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -117,6 +117,11 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 #define F2FS_MOUNT_NAT_BITS 0x10000000 #define F2FS_MOUNT_INLINECRYPT 0x20000000 +/* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. + */ +#define F2FS_MOUNT_LAZYTIME 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 55ccb7a35f02..500244eb94b0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -920,10 +920,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #endif case Opt_lazytime: - sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, LAZYTIME); break; case Opt_nolazytime: - sb->s_flags &= ~SB_LAZYTIME; + clear_opt(sbi, LAZYTIME); break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -2186,8 +2186,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, MERGE_CHECKPOINT); + set_opt(sbi, LAZYTIME); F2FS_OPTION(sbi).unusable_cap = 0; - sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_has_blkzoned(sbi)) @@ -4588,6 +4588,11 @@ try_onemore: if (test_opt(sbi, INLINECRYPT)) sb->s_flags |= SB_INLINECRYPT; + if (test_opt(sbi, LAZYTIME)) + sb->s_flags |= SB_LAZYTIME; + else + sb->s_flags &= ~SB_LAZYTIME; + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); super_set_sysfs_name_bdev(sb); sb->s_iflags |= SB_I_CGROUPWB; From 0edcb2197e761db482d485bc0e0f5fd42cc1bc3d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:16 -0600 Subject: [PATCH 62/68] f2fs: Pass sbi rather than sb to f2fs_set_test_dummy_encryption This removes another sb instance from parse_options() Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 500244eb94b0..4ffa95dd41df 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -486,12 +486,11 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int f2fs_set_test_dummy_encryption(struct super_block *sb, +static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, const char *opt, const substring_t *arg, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); struct fs_parameter param = { .type = fs_value_is_string, .string = arg->from ? arg->from : "", @@ -1032,7 +1031,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], is_remount); if (ret) return ret; From 9cca49875997a1a7e92800a828a62bacb0f577b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:17 -0600 Subject: [PATCH 63/68] f2fs: defer readonly check vs norecovery Defer the readonly-vs-norecovery check until after option parsing is done so that option parsing does not require an active superblock for the test. Add a helpful message, while we're at it. (I think could be moved back into parsing after we switch to the new mount API if desired, as the fs context will have RO state available.) Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4ffa95dd41df..3589d1ff97a3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -730,10 +730,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: - /* this option mounts f2fs with ro */ + /* requires ro mount, checked in f2fs_default_check */ set_opt(sbi, NORECOVERY); - if (!f2fs_readonly(sb)) - return -EINVAL; break; case Opt_discard: if (!f2fs_hw_support_discard(sbi)) { @@ -1417,6 +1415,12 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + + if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "norecovery requires readonly mount"); + return -EINVAL; + } + return 0; } From b7de231b9df4eafc87ec693312c8889fc20f4e55 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:18 -0600 Subject: [PATCH 64/68] f2fs: pass sbi rather than sb to quota qf_name helpers With the new mount api we will not have the superblock available during option parsing. Prepare for this by passing *sbi rather than *sb. For now, we are parsing after fill_super has been done, so sbi->sb will exist. Under the new mount API this will require more care, but do the simple change for now. Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3589d1ff97a3..31590a1739d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -386,10 +386,10 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct super_block *sb, int qtype, +static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, substring_t *args) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; char *qname; int ret = -EINVAL; @@ -427,9 +427,9 @@ errout: return ret; } -static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); @@ -934,32 +934,32 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, PRJQUOTA); break; case Opt_usrjquota: - ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); if (ret) return ret; break; case Opt_grpjquota: - ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); if (ret) return ret; break; case Opt_prjjquota: - ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); if (ret) return ret; break; case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sb, USRQUOTA); + ret = f2fs_clear_qf_name(sbi, USRQUOTA); if (ret) return ret; break; case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sb, GRPQUOTA); + ret = f2fs_clear_qf_name(sbi, GRPQUOTA); if (ret) return ret; break; case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sb, PRJQUOTA); + ret = f2fs_clear_qf_name(sbi, PRJQUOTA); if (ret) return ret; break; From 71e9bd3d5c04c19ef80d0bb33bf7ff1a2f0eeafb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 3 Mar 2025 11:12:19 -0600 Subject: [PATCH 65/68] f2fs: pass sbi rather than sb to parse_options() With the new mount API the sb will not be available during initial option parsing, which will happen before fill_super reads sb from disk. Now that the sb is no longer directly referenced in parse_options, switch it to use sbi. (Note that all calls to f2fs_sb_has_* originating from parse_options will need to be deferred to later before we can use the new mount API.) Signed-off-by: Eric Sandeen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31590a1739d5..7aff579893e4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -673,9 +673,8 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) #endif #endif -static int parse_options(struct super_block *sb, char *options, bool is_remount) +static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -2376,7 +2375,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); /* parse mount options */ - err = parse_options(sb, data, true); + err = parse_options(sbi, data, true); if (err) goto restore_opts; @@ -4546,7 +4545,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options, false); + err = parse_options(sbi, options, false); if (err) goto free_options; From f098aeba04c9328571567dca45159358a250240c Mon Sep 17 00:00:00 2001 From: Yeongjin Gil Date: Fri, 14 Mar 2025 21:06:51 +0900 Subject: [PATCH 66/68] f2fs: fix to avoid atomicity corruption of atomic file In the case of the following call stack for an atomic file, FI_DIRTY_INODE is set, but FI_ATOMIC_DIRTIED is not subsequently set. f2fs_file_write_iter f2fs_map_blocks f2fs_reserve_new_blocks inc_valid_block_count __mark_inode_dirty(dquot) f2fs_dirty_inode If FI_ATOMIC_DIRTIED is not set, atomic file can encounter corruption due to a mismatch between old file size and new data. To resolve this issue, I changed to set FI_ATOMIC_DIRTIED when FI_DIRTY_INODE is set. This ensures that FI_DIRTY_INODE, which was previously cleared by the Writeback thread during the commit atomic, is set and i_size is updated. Cc: Fixes: fccaa81de87e ("f2fs: prevent atomic file from being dirtied before commit") Reviewed-by: Sungjong Seo Reviewed-by: Sunmin Jeong Signed-off-by: Yeongjin Gil Reviewed-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 +--- fs/f2fs/super.c | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index aa2f41696a88..83f862578fc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -34,10 +34,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if (f2fs_is_atomic_file(inode)) { - set_inode_flag(inode, FI_ATOMIC_DIRTIED); + if (f2fs_is_atomic_file(inode)) return; - } mark_inode_dirty_sync(inode); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7aff579893e4..f087b2b71c89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1530,6 +1530,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (!ret && f2fs_is_atomic_file(inode)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + return ret; } From 351bc761338d9be29065c2b99e603bab336792b7 Mon Sep 17 00:00:00 2001 From: Yohan Joung Date: Mon, 17 Mar 2025 22:36:11 +0900 Subject: [PATCH 67/68] f2fs: optimize f2fs DIO overwrites this is unnecessary when we know we are overwriting already allocated blocks and the overhead of starting a transaction can be significant especially for multithreaded workloads doing small writes. Signed-off-by: Yohan Joung Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d872f785a996..d99ec9370b1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4181,7 +4181,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); - if (flags & IOMAP_WRITE) + + /* + * If the blocks being overwritten are already allocated, + * f2fs_map_lock and f2fs_balance_fs are not necessary. + */ + if ((flags & IOMAP_WRITE) && + !f2fs_overwrite_io(inode, offset, length)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); From 21263d035ff21fa0ccf79adba20bab9cd8cca0f2 Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Mon, 17 Mar 2025 04:16:24 -0600 Subject: [PATCH 68/68] f2fs: fix missing discard for active segments During a checkpoint, the current active segment X may not be handled properly. This occurs when segment X has 0 valid blocks and a non-zero number of discard blocks, for the following reasons: locate_dirty_segment() does not mark any active segment as a prefree segment. As a result, segment X is not included in dirty_segmap[PRE], and f2fs_clear_prefree_segments() skips it when handling prefree segments. add_discard_addrs() skips any segment with 0 valid blocks, so segment X is also skipped. Consequently, no `struct discard_cmd` is actually created for segment X. However, the ckpt_valid_map and cur_valid_map of segment X are synced by seg_info_to_raw_sit() during the current checkpoint process. As a result, it cannot find the missing discard bits even in subsequent checkpoints. Consequently, the value of sbi->discard_blks remains non-zero. Thus, when f2fs is umounted, CP_TRIMMED_FLAG will not be set due to the non-zero sbi->discard_blks. Relevant code process: f2fs_write_checkpoint() f2fs_flush_sit_entries() list_for_each_entry_safe(ses, tmp, head, set_list) { for_each_set_bit_from(segno, bitmap, end) { ... add_discard_addrs(sbi, cpc, false); // skip segment X due to its 0 valid blocks ... seg_info_to_raw_sit(); // sync ckpt_valid_map with cur_valid_map for segment X ... } } f2fs_clear_prefree_segments(); // segment X is not included in dirty_segmap[PRE] and is skipped This issue is easy to reproduce with the following operations: root # mkfs.f2fs -f /dev/f2fs_dev root # mount -t f2fs /dev/f2fs_dev /mnt_point root # dd if=/dev/blk_dev of=/mnt_point/1.bin bs=4k count=256 root # sync root # rm /mnt_point/1.bin root # umount /mnt_point root # dump.f2fs /dev/f2fs_dev | grep "checkpoint state" Info: checkpoint state = 45 : crc compacted_summary unmount ---- 'trimmed' flag is missing Since add_discard_addrs() can handle active segments with non-zero valid blocks, it is reasonable to fix this issue by allowing it to also handle active segments with 0 valid blocks. Fixes: b29555505d81 ("f2fs: add key functions for small discards") Signed-off-by: Chunhai Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 50a346f7cb93..396ef71f41e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2096,7 +2096,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; if (!force) { - if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || + (!se->valid_blocks && + !IS_CURSEG(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false;