diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3dfee94e0618..cbeac1bebe2f 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -377,3 +377,35 @@ Description: This gives a control to limit the bio size in f2fs. Default is zero, which will follow underlying block layer limit, whereas, if it has a certain bytes value, f2fs won't submit a bio larger than that size. + +What: /sys/fs/f2fs//stat/sb_status +Date: December 2020 +Contact: "Chao Yu" +Description: Show status of f2fs superblock in real time. + + ====== ===================== ================================= + value sb status macro description + 0x1 SBI_IS_DIRTY dirty flag for checkpoint + 0x2 SBI_IS_CLOSE specify unmounting + 0x4 SBI_NEED_FSCK need fsck.f2fs to fix + 0x8 SBI_POR_DOING recovery is doing or not + 0x10 SBI_NEED_SB_WRITE need to recover superblock + 0x20 SBI_NEED_CP need to checkpoint + 0x40 SBI_IS_SHUTDOWN shutdown by ioctl + 0x80 SBI_IS_RECOVERED recovered orphan/data + 0x100 SBI_CP_DISABLED CP was disabled last mount + 0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly + 0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP + 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP + 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted + 0x2000 SBI_IS_RESIZEFS resizefs is in process + ====== ===================== ================================= + +What: /sys/fs/f2fs//ckpt_thread_ioprio +Date: January 2021 +Contact: "Daeho Jeong" +Description: Give a way to change checkpoint merge daemon's io priority. + Its default value is "be,3", which means "BE" I/O class and + I/O priority "3". We can select the class between "rt" and "be", + and set the I/O priority within valid range of it. "," delimiter + is necessary in between I/O class and priority number. diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index dae15c96e659..35ed01a5fbc9 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 @@ -247,8 +246,24 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl hide up to all remaining free space. The actual space that would be unusable can be viewed at /sys/fs/f2fs//unusable This space is reclaimed once checkpoint=enable. +checkpoint_merge When checkpoint is enabled, this can be used to create a kernel + daemon and make it to merge concurrent checkpoint requests as + much as possible to eliminate redundant checkpoint issues. Plus, + we can eliminate the sluggish issue caused by slow checkpoint + operation when the checkpoint is done in a process context in + a cgroup having low i/o budget and cpu shares. To make this + do better, we set the default i/o priority of the kernel daemon + to "3", to give one higher priority than other kernel threads. + This is the same way to give a I/O priority to the jbd2 + journaling thread of ext4 filesystem. +nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. +compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only + "lz4" and "zstd" support compress level config. + algorithm level range + lz4 3 - 16 + zstd 1 - 22 compress_log_size=%u Support configuring compress cluster size, the size will be 4KB * (1 << %u), 16KB is minimum size, also it's default size. @@ -831,7 +846,7 @@ This is the default option. f2fs does automatic compression in the writeback of compression enabled files. 2) compress_mode=user -This disables the automaic compression and gives the user discretion of choosing the +This disables the automatic compression and gives the user discretion of choosing the target file and the timing. The user can do manual compression/decompression on the compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below. diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index e0204a23e997..1d831e3cbcb3 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -217,6 +217,82 @@ FS_IOC_MEASURE_VERITY can fail with the following errors: - ``EOVERFLOW``: the digest is longer than the specified ``digest_size`` bytes. Try providing a larger buffer. +FS_IOC_READ_VERITY_METADATA +--------------------------- + +The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a +verity file. This ioctl is available since Linux v5.12. + +This ioctl allows writing a server program that takes a verity file +and serves it to a client program, such that the client can do its own +fs-verity compatible verification of the file. This only makes sense +if the client doesn't trust the server and if the server needs to +provide the storage for the client. + +This is a fairly specialized use case, and most fs-verity users won't +need this ioctl. + +This ioctl takes in a pointer to the following structure:: + + #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 + #define FS_VERITY_METADATA_TYPE_SIGNATURE 3 + + struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; + }; + +``metadata_type`` specifies the type of metadata to read: + +- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the + Merkle tree. The blocks are returned in order from the root level + to the leaf level. Within each level, the blocks are returned in + the same order that their hashes are themselves hashed. + See `Merkle tree`_ for more information. + +- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity + descriptor. See `fs-verity descriptor`_. + +- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was + passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature + verification`_. + +The semantics are similar to those of ``pread()``. ``offset`` +specifies the offset in bytes into the metadata item to read from, and +``length`` specifies the maximum number of bytes to read from the +metadata item. ``buf_ptr`` is the pointer to the buffer to read into, +cast to a 64-bit integer. ``__reserved`` must be 0. On success, the +number of bytes read is returned. 0 is returned at the end of the +metadata item. The returned length may be less than ``length``, for +example if the ioctl is interrupted. + +The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed +to be authenticated against the file digest that would be returned by +`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to +implement fs-verity compatible verification anyway (though absent a +malicious disk, the metadata will indeed match). E.g. to implement +this ioctl, the filesystem is allowed to just read the Merkle tree +blocks from disk without actually verifying the path to the root node. + +FS_IOC_READ_VERITY_METADATA can fail with the following errors: + +- ``EFAULT``: the caller provided inaccessible memory +- ``EINTR``: the ioctl was interrupted before any data was read +- ``EINVAL``: reserved fields were set, or ``offset + length`` + overflowed +- ``ENODATA``: the file is not a verity file, or + FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't + have a built-in signature +- ``ENOTTY``: this type of filesystem does not implement fs-verity, or + this ioctl is not yet implemented on it +- ``EOPNOTSUPP``: the kernel was not configured with fs-verity + support, or the filesystem superblock has not had the 'verity' + feature enabled on it. (See `Filesystem support`_.) + FS_IOC_GETFLAGS --------------- diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index ba4e51853cba..e7af0363d154 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -97,11 +97,6 @@ struct mapped_device { */ struct workqueue_struct *wq; - /* - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; - /* forced geometry settings */ struct hd_geometry geometry; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eaf4587ad2aa..2322741035ca 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2407,27 +2407,19 @@ static int lock_fs(struct mapped_device *md) { int r; - WARN_ON(md->frozen_sb); + WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - md->frozen_sb = freeze_bdev(md->bdev); - if (IS_ERR(md->frozen_sb)) { - r = PTR_ERR(md->frozen_sb); - md->frozen_sb = NULL; - return r; - } - - set_bit(DMF_FROZEN, &md->flags); - - return 0; + r = freeze_bdev(md->bdev); + if (!r) + set_bit(DMF_FROZEN, &md->flags); + return r; } static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - - thaw_bdev(md->bdev, md->frozen_sb); - md->frozen_sb = NULL; + thaw_bdev(md->bdev); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 2ea189c1b4ff..ff6d1fa3e5d3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -556,55 +556,47 @@ EXPORT_SYMBOL(fsync_bdev); * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze * actually. */ -struct super_block *freeze_bdev(struct block_device *bdev) +int freeze_bdev(struct block_device *bdev) { struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) { - /* - * We don't even need to grab a reference - the first call - * to freeze_bdev grab an active reference and only the last - * thaw_bdev drops it. - */ - sb = get_super(bdev); - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } + if (++bdev->bd_fsfreeze_count > 1) + goto done; sb = get_active_super(bdev); if (!sb) - goto out; + goto sync; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb); else error = freeze_super(sb); - if (error) { - deactivate_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } deactivate_super(sb); - out: + + if (error) { + bdev->bd_fsfreeze_count--; + goto done; + } + bdev->bd_fsfreeze_sb = sb; + +sync: sync_blockdev(bdev); +done: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount */ + return error; } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock - * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev) { + struct super_block *sb; int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); @@ -615,6 +607,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) if (--bdev->bd_fsfreeze_count > 0) goto out; + sb = bdev->bd_fsfreeze_sb; if (!sb) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index af01c0f0cc78..85fc97361b45 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -524,7 +524,7 @@ repeat: void emergency_thaw_bdev(struct super_block *sb) { - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 106bf149e8ca..713b1ae44c1a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -624,7 +624,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) case EXT4_GOING_FLAGS_DEFAULT: freeze_bdev(sb->s_bdev); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); @@ -1309,6 +1309,12 @@ out: return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1391,6 +1397,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index d13c5c6a9787..62e638a49bbf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -76,16 +76,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS @@ -119,6 +109,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 617d0f6b0836..c7fc8a33616f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,13 +13,15 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -443,7 +445,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1018,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) @@ -1387,8 +1387,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, static inline u64 get_sectors_written(struct block_device *bdev) { - return bdev->bd_part ? - (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]) : 0; + return (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]); } u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) @@ -1708,3 +1707,174 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4bcbacfe3325..77fa342de38f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), @@ -721,38 +756,27 @@ out: return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -764,20 +788,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -816,18 +840,34 @@ out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1415,7 +1455,7 @@ retry_write: ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1450,6 +1490,9 @@ retry_write: *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { @@ -1494,6 +1537,8 @@ destroy_out: return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1512,12 +1557,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1546,7 +1593,7 @@ out_free: return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { int i; @@ -1574,30 +1621,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d1e83f119338..0aef7c21931a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,7 +25,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include #include @@ -51,27 +50,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) -{ - if (noio) { - /* No failure on bio allocation */ - return __f2fs_bio_alloc(GFP_NOIO, npages); - } - - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -116,10 +94,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -129,25 +118,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; struct bvec_iter_all iter_all; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -158,106 +148,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } -} -#endif - -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; - } -#endif + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) @@ -265,74 +253,36 @@ static void f2fs_post_read_work(struct work_struct *work) struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. - */ - - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, false, false); -} - -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) @@ -443,7 +393,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); + bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -504,7 +454,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; @@ -712,7 +662,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -917,7 +866,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -1014,7 +962,6 @@ alloc_new: wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: @@ -1027,24 +974,18 @@ out: up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), - for_write); + bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, + min_t(int, nr_pages, BIO_MAX_PAGES), + &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); @@ -1055,13 +996,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; - if (post_read_steps) { + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1073,13 +1020,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1088,7 +1028,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1969,6 +1909,7 @@ next: } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; @@ -2126,7 +2067,7 @@ submit_and_realloc: if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2172,8 +2113,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2239,29 +2178,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2277,31 +2197,10 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2313,10 +2212,9 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2333,7 +2231,13 @@ submit_and_realloc: out_put_dnode: f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } @@ -2342,11 +2246,6 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. - * - * Note that the aops->readpages() function is ONLY used for read-ahead. If - * this function ever deviates from doing just read-ahead, it should either - * use ->readpage() or do the necessary surgery to decouple ->readpages() - * from read-ahead. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) @@ -2369,7 +2268,6 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; - bool drop_ra = false; map.m_pblk = 0; map.m_lblk = 0; @@ -2380,26 +2278,10 @@ static int f2fs_mpage_readpages(struct inode *inode, map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - /* - * Two readahead threads for same address range can cause race condition - * which fragments sequential read IOs. So let's avoid each other. - */ - if (rac && readahead_count(rac)) { - if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac)) - drop_ra = true; - else - WRITE_ONCE(F2FS_I(inode)->ra_offset, - readahead_index(rac)); - } - for (; nr_pages; nr_pages--) { if (rac) { page = readahead_page(rac); prefetchw(&page->flags); - if (drop_ra) { - f2fs_put_page(page, 1); - continue; - } } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2462,9 +2344,6 @@ next_page: } if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - - if (rac && readahead_count(rac) && !drop_ra) - WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); return ret; } @@ -2748,7 +2627,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2886,7 +2766,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2933,7 +2813,7 @@ out: #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -3101,7 +2981,8 @@ continue_unlock: } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -3877,7 +3758,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { @@ -4154,12 +4035,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5130423a13e7..52b7c7d0a351 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,7 +43,6 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -97,6 +96,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -146,6 +146,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -266,6 +267,26 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -717,7 +738,6 @@ struct f2fs_inode_info { struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ - pgoff_t ra_offset; /* ongoing readahead offset */ struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ @@ -735,6 +755,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1310,6 +1331,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -1337,7 +1360,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1353,11 +1376,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -1404,6 +1453,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1444,7 +1494,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -1541,9 +1590,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs//stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ @@ -3232,6 +3284,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -3418,13 +3471,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, @@ -3469,7 +3525,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3530,6 +3586,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; @@ -3715,8 +3773,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) @@ -3876,7 +3932,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3889,9 +3945,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3915,6 +3970,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -3934,6 +3997,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); @@ -4118,6 +4186,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 16ea10f2bcf5..471a6ff0c937 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include #include @@ -60,6 +59,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -70,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -366,7 +372,6 @@ flush_out: f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } @@ -483,6 +488,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -502,7 +510,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -510,11 +517,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); @@ -667,7 +669,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -767,6 +769,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -848,7 +854,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -865,6 +872,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -949,8 +964,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } @@ -2236,16 +2253,12 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); + ret = freeze_bdev(sb->s_bdev); + if (ret) goto out; - } - if (sb) { - f2fs_stop_checkpoint(sbi, false); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - thaw_bdev(sb->s_bdev, sb); - } + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ @@ -2734,7 +2747,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3297,7 +3310,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; @@ -3361,6 +3374,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4047,8 +4068,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); @@ -4276,6 +4299,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); case FS_IOC_GETFSLABEL: return f2fs_ioc_getfslabel(filp, arg); case FS_IOC_SETFSLABEL: @@ -4353,6 +4378,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; @@ -4417,6 +4447,7 @@ write: if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, @@ -4527,6 +4558,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case F2FS_IOC_GET_COMPRESS_BLOCKS: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0a8f64feefe4..e8281a01449d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -210,6 +210,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6edb1ab579a1..887804968576 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -855,7 +855,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1041,7 +1045,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a24423ac65f..a8a0fb890e8d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -2696,7 +2694,7 @@ retry: src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index deca74cb17df..4c35fefaf4d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -566,17 +563,7 @@ do_sync: static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev, GFP_NOFS); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); @@ -610,8 +597,6 @@ repeat: if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -632,8 +617,6 @@ repeat: fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ab9efd71ad11..c2ba4a087983 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,13 +25,14 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -45,7 +46,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -143,6 +143,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -213,6 +215,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, @@ -464,6 +468,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -872,6 +944,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -882,17 +960,45 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; - } else if (!strcmp(name, "lz4")) { +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - } else if (!strcmp(name, "zstd")) { +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; +#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif } else { kfree(name); return -EINVAL; @@ -1076,8 +1182,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->ra_offset = -1; - return &fi->vfs_inode; } @@ -1245,6 +1349,12 @@ static void f2fs_put_super(struct super_block *sb) /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -1343,16 +1453,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; - - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } - f2fs_trace_ios(NULL, 1); + if (sync) + err = f2fs_issue_checkpoint(sbi); return err; } @@ -1369,6 +1471,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } @@ -1539,6 +1645,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); @@ -1674,6 +1783,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1957,6 +2070,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2641,10 +2767,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2653,6 +2779,11 @@ static loff_t max_file_blocks(void) * result as zero. */ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3536,8 +3667,7 @@ try_onemore: if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; @@ -3704,6 +3834,19 @@ try_onemore: f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3789,12 +3932,10 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } @@ -3913,6 +4054,8 @@ free_nm: free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -4027,8 +4170,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4121,7 +4262,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 989a649cfa8b..e38a7f6921dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -90,26 +94,23 @@ static ssize_t free_segments_show(struct f2fs_attr *a, static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); @@ -264,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -317,6 +335,38 @@ out: return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -576,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -661,6 +712,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), @@ -711,6 +763,13 @@ static struct attribute *f2fs_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_feat); +F2FS_GENERAL_RO_ATTR(sb_status); +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -739,6 +798,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -945,11 +1042,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -965,6 +1066,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -976,6 +1084,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index eb892dbe85e3..d0313592a8ae 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; diff --git a/fs/libfs.c b/fs/libfs.c index e52818fb276a..1b4a215f7b74 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1383,8 +1383,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1421,7 +1421,6 @@ fallback: return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1430,7 +1429,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1445,7 +1444,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 570e9136334d..435559a4fa9e 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \ init.o \ measure.o \ open.o \ + read_metadata.o \ verify.o obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 96f7b332f54f..a275cad1548a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *desc, size_t desc_size); + struct fsverity_descriptor *desc, + size_t desc_size); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret); + int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); @@ -135,15 +140,13 @@ void __init fsverity_exit_info_cache(void); #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size); + const u8 *signature, size_t sig_size); int __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { return 0; } diff --git a/fs/verity/open.c b/fs/verity/open.c index 228d0eca3e2e..60ff8af7219f 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, } /* - * Validate the given fsverity_descriptor and create a new fsverity_info from - * it. The signature (if present) is also checked. + * Create a new fsverity_info from the given fsverity_descriptor (with optional + * appended signature), and check the signature if present. The + * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *_desc, size_t desc_size) + struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_descriptor *desc = _desc; struct fsverity_info *vi; int err; - if (desc_size < sizeof(*desc)) { - fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", - desc_size); - return ERR_PTR(-EINVAL); - } - - if (desc->version != 1) { - fsverity_err(inode, "Unrecognized descriptor version: %u", - desc->version); - return ERR_PTR(-EINVAL); - } - - if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { - fsverity_err(inode, "Reserved bits set in descriptor"); - return ERR_PTR(-EINVAL); - } - - if (desc->salt_size > sizeof(desc->salt)) { - fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); - return ERR_PTR(-EINVAL); - } - - if (le64_to_cpu(desc->data_size) != inode->i_size) { - fsverity_err(inode, - "Wrong data_size: %llu (desc) != %lld (inode)", - le64_to_cpu(desc->data_size), inode->i_size); - return ERR_PTR(-EINVAL); - } - vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); @@ -209,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->tree_params.hash_alg->name, vi->tree_params.digest_size, vi->file_digest); - err = fsverity_verify_signature(vi, desc, desc_size); + err = fsverity_verify_signature(vi, desc->signature, + le32_to_cpu(desc->sig_size)); out: if (err) { fsverity_free_info(vi); @@ -245,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi) kmem_cache_free(fsverity_info_cachep, vi); } -/* Ensure the inode has an ->i_verity_info */ -static int ensure_verity_info(struct inode *inode) +static bool validate_fsverity_descriptor(struct inode *inode, + const struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_info *vi = fsverity_get_info(inode); - struct fsverity_descriptor *desc; - int res; + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return false; + } - if (vi) - return 0; + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return false; + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return false; + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return false; + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return false; + } + + if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return false; + } + + return true; +} + +/* + * Read the inode's fsverity_descriptor (with optional appended signature) from + * the filesystem, and do basic validation of it. + */ +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret) +{ + int res; + struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { @@ -272,20 +287,46 @@ static int ensure_verity_info(struct inode *inode) res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); - goto out_free_desc; + kfree(desc); + return res; } - vi = fsverity_create_info(inode, desc, res); + if (!validate_fsverity_descriptor(inode, desc, res)) { + kfree(desc); + return -EINVAL; + } + + *desc_ret = desc; + *desc_size_ret = res; + return 0; +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + size_t desc_size; + int err; + + if (vi) + return 0; + + err = fsverity_get_descriptor(inode, &desc, &desc_size); + if (err) + return err; + + vi = fsverity_create_info(inode, desc, desc_size); if (IS_ERR(vi)) { - res = PTR_ERR(vi); + err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); - res = 0; + err = 0; out_free_desc: kfree(desc); - return res; + return err; } /** diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c new file mode 100644 index 000000000000..7e2d0c7bdf0d --- /dev/null +++ b/fs/verity/read_metadata.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ioctl to read verity metadata + * + * Copyright 2021 Google LLC + */ + +#include "fsverity_private.h" + +#include +#include +#include +#include + +static int fsverity_read_merkle_tree(struct inode *inode, + const struct fsverity_info *vi, + void __user *buf, u64 offset, int length) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + u64 end_offset; + unsigned int offs_in_page; + pgoff_t index, last_index; + int retval = 0; + int err = 0; + + end_offset = min(offset + length, vi->tree_params.tree_size); + if (offset >= end_offset) + return 0; + offs_in_page = offset_in_page(offset); + last_index = (end_offset - 1) >> PAGE_SHIFT; + + /* + * Iterate through each Merkle tree page in the requested range and copy + * the requested portion to userspace. Note that the Merkle tree block + * size isn't important here, as we are returning a byte stream; i.e., + * we can just work with pages even if the tree block size != PAGE_SIZE. + */ + for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { + unsigned long num_ra_pages = + min_t(unsigned long, last_index - index + 1, + inode->i_sb->s_bdi->io_pages); + unsigned int bytes_to_copy = min_t(u64, end_offset - offset, + PAGE_SIZE - offs_in_page); + struct page *page; + const void *virt; + + page = vops->read_merkle_tree_page(inode, index, num_ra_pages); + if (IS_ERR(page)) { + err = PTR_ERR(page); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, index); + break; + } + + virt = kmap(page); + if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { + kunmap(page); + put_page(page); + err = -EFAULT; + break; + } + kunmap(page); + put_page(page); + + retval += bytes_to_copy; + buf += bytes_to_copy; + offset += bytes_to_copy; + + if (fatal_signal_pending(current)) { + err = -EINTR; + break; + } + cond_resched(); + offs_in_page = 0; + } + return retval ? retval : err; +} + +/* Copy the requested portion of the buffer to userspace. */ +static int fsverity_read_buffer(void __user *dst, u64 offset, int length, + const void *src, size_t src_length) +{ + if (offset >= src_length) + return 0; + src += offset; + src_length -= offset; + + length = min_t(size_t, length, src_length); + + if (copy_to_user(dst, src, length)) + return -EFAULT; + + return length; +} + +static int fsverity_read_descriptor(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + /* don't include the signature */ + desc_size = offsetof(struct fsverity_descriptor, signature); + desc->sig_size = 0; + + res = fsverity_read_buffer(buf, offset, length, desc, desc_size); + + kfree(desc); + return res; +} + +static int fsverity_read_signature(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + if (desc->sig_size == 0) { + res = -ENODATA; + goto out; + } + + /* + * Include only the signature. Note that fsverity_get_descriptor() + * already verified that sig_size is in-bounds. + */ + res = fsverity_read_buffer(buf, offset, length, desc->signature, + le32_to_cpu(desc->sig_size)); +out: + kfree(desc); + return res; +} + +/** + * fsverity_ioctl_read_metadata() - read verity metadata from a file + * @filp: file to read the metadata from + * @uarg: user pointer to fsverity_read_metadata_arg + * + * Return: length read on success, 0 on EOF, -errno on failure + */ +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_info *vi; + struct fsverity_read_metadata_arg arg; + int length; + void __user *buf; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + /* + * Note that we don't have to explicitly check that the file is open for + * reading, since verity files can only be opened for reading. + */ + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.__reserved) + return -EINVAL; + + /* offset + length must not overflow. */ + if (arg.offset + arg.length < arg.offset) + return -EINVAL; + + /* Ensure that the return value will fit in INT_MAX. */ + length = min_t(u64, arg.length, INT_MAX); + + buf = u64_to_user_ptr(arg.buf_ptr); + + switch (arg.metadata_type) { + case FS_VERITY_METADATA_TYPE_MERKLE_TREE: + return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, + length); + case FS_VERITY_METADATA_TYPE_DESCRIPTOR: + return fsverity_read_descriptor(inode, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_SIGNATURE: + return fsverity_read_signature(inode, buf, arg.offset, length); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata); diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 6e8b0592fbf9..e33f6c401373 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -26,6 +26,27 @@ static int fsverity_require_signatures; */ static struct key *fsverity_keyring; +/** + * fsverity_verify_signature() - check a verity file's signature + * @vi: the file's fsverity_info + * @signature: the file's built-in signature + * @sig_size: size of signature in bytes, or 0 if no signature + * + * If the file includes a signature of its fs-verity file digest, verify it + * against the certificates in the fs-verity keyring. + * + * Return: 0 on success (signature valid or not required); -errno on failure + */ +int fsverity_verify_signature(const struct fsverity_info *vi, + const u8 *signature, size_t sig_size) +{ + unsigned int digest_algorithm = + vi->tree_params.hash_alg - fsverity_hash_algs; + + return __fsverity_verify_signature(vi->inode, signature, sig_size, + vi->file_digest, digest_algorithm); +} + /** * __fsverity_verify_signature() - check a verity file's signature * @inode: the file's inode @@ -40,7 +61,7 @@ static struct key *fsverity_keyring; * Return: 0 on success (signature valid or not required); -errno on failure */ int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, - u32 sig_size, const u8 *file_digest, + size_t sig_size, const u8 *file_digest, unsigned int digest_algorithm) { struct fsverity_formatted_digest *d; @@ -69,8 +90,7 @@ int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, memcpy(d->digest, file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, - signature, sig_size, - fsverity_keyring, + signature, sig_size, fsverity_keyring, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); kfree(d); @@ -95,34 +115,6 @@ int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, } EXPORT_SYMBOL_GPL(__fsverity_verify_signature); -/** - * fsverity_verify_signature() - check a verity file's signature - * @vi: the file's fsverity_info - * @desc: the file's fsverity_descriptor - * @desc_size: size of @desc - * - * If the file's fs-verity descriptor includes a signature of the file digest, - * verify it against the certificates in the fs-verity keyring. - * - * Return: 0 on success (signature valid or not required); -errno on failure - */ -int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) -{ - const struct inode *inode = vi->inode; - const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; - const u32 sig_size = le32_to_cpu(desc->sig_size); - - if (sig_size > desc_size - sizeof(*desc)) { - fsverity_err(inode, "Signature overflows verity descriptor"); - return -EBADMSG; - } - - return __fsverity_verify_signature(inode, desc->signature, sig_size, - vi->file_digest, hash_alg - fsverity_hash_algs); -} - #ifdef CONFIG_SYSCTL static struct ctl_table_header *fsverity_sysctl_header; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ef1d5bb88b93..b7c5783a031c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -433,13 +433,10 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); - - if (sb && !IS_ERR(sb)) { + if (!freeze_bdev(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(mp->m_super->s_bdev); } - break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8bac50d6e8c5..e47f86a147a6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -46,6 +46,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + struct super_block *bd_fsfreeze_sb; } __randomize_layout; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 542471b76f41..668e6a732025 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2032,7 +2032,7 @@ static inline int sync_blockdev(struct block_device *bdev) #endif int fsync_bdev(struct block_device *bdev); -struct super_block *freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev, struct super_block *sb); +int freeze_bdev(struct block_device *bdev); +int thaw_bdev(struct block_device *bdev); #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 7dc2a06cf19a..c6cc0a566ef5 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -274,6 +274,9 @@ struct f2fs_inode { __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of cluster size */ __le16 i_compress_flag; /* compress flag */ + /* 0 bit: chksum flag + * [10,15] bits: compress level + */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 70d13ce3052b..342a93553eaf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3197,11 +3197,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -#ifdef CONFIG_UNICODE -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name); -#endif extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index c24581106331..19c118e5ad83 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -138,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp); int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void fsverity_cleanup_inode(struct inode *inode); +/* read_metadata.c */ + +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); + /* verify.c */ bool fsverity_verify_page(struct page *page); @@ -183,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode) { } +/* read_metadata.c */ + +static inline int fsverity_ioctl_read_metadata(struct file *filp, + const void __user *uarg) +{ + return -EOPNOTSUPP; +} + /* verify.c */ static inline bool fsverity_verify_page(struct page *page) @@ -223,11 +235,11 @@ static inline bool fsverity_active(const struct inode *inode) #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, - u32 sig_size, const u8 *file_digest, + size_t sig_size, const u8 *file_digest, unsigned int digest_algorithm); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int __fsverity_verify_signature(const struct inode *inode, - const u8 *signature, u32 sig_size, + const u8 *signature, size_t sig_size, const u8 *file_digest, unsigned int digest_algorithm) { diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 33f44156f8ea..15384e22e331 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -83,7 +83,21 @@ struct fsverity_formatted_digest { __u8 digest[]; }; +#define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 +#define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 +#define FS_VERITY_METADATA_TYPE_SIGNATURE 3 + +struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; +}; + #define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) #define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) +#define FS_IOC_READ_VERITY_METADATA \ + _IOWR('f', 135, struct fsverity_read_metadata_arg) #endif /* _UAPI_LINUX_FSVERITY_H */