From a68cb18624503a09a1b10e72c3bcc90f2eeb3ded Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 16 Apr 2025 10:19:05 +0200 Subject: [PATCH 1/4] mount: add a comment about concurrent changes with statmount()/listmount() Add some comments in there highlighting a few non-obvious assumptions. Link: https://lore.kernel.org/20250416-zerknirschen-aluminium-14a55639076f@brauner Signed-off-by: Christian Brauner --- fs/namespace.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 14935a0500a2..ddb0a688633c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5831,13 +5831,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; - s->idmap = mnt_idmap(s->mnt); - if (s->mask & STATMOUNT_SB_BASIC) - statmount_sb_basic(s); + /* + * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap + * can change concurrently as we only hold the read-side of the + * namespace semaphore and mount properties may change with only + * the mount lock held. + * + * We could sample the mount lock sequence counter to detect + * those changes and retry. But it's not worth it. Worst that + * happens is that the mnt->mnt_idmap pointer is already changed + * while mnt->mnt_flags isn't or vica versa. So what. + * + * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved + * via READ_ONCE()/WRITE_ONCE() and guard against theoretical + * torn read/write. That's all we care about right now. + */ + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); @@ -6149,6 +6165,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; + /* + * We only need to guard against mount topology changes as + * listmount() doesn't care about any mount properties. + */ scoped_guard(rwsem_read, &namespace_sem) ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); From 7fc711739eb8c30955fd8a85be0db7d5a0aa10ae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Apr 2025 04:35:09 +0100 Subject: [PATCH 2/4] ->mnt_devname is never NULL Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()" back in 2018. Get rid of the dead checks... Signed-off-by: Al Viro Link: https://lore.kernel.org/20250421033509.GV2023217@ZenIV Signed-off-by: Christian Brauner --- fs/namespace.c | 13 +++++++------ fs/proc_namespace.c | 12 ++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddb0a688633c..924298d11df7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name) if (err) goto out_free_cache; - if (name) { + if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); - if (!mnt->mnt_devname) - goto out_free_id; - } + else + mnt->mnt_devname = "none"; + if (!mnt->mnt_devname) + goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); @@ -1268,7 +1269,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (!fc->root) return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(fc->source ?: "none"); + mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); @@ -5483,7 +5484,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); - } else if (r->mnt_devname) { + } else { seq_puts(seq, r->mnt_devname); } return 0; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index e133b507ddf3..5c555db68aa2 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; } else { - mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + mangle(m, r->mnt_devname); } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; } else { - mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + mangle(m, r->mnt_devname); } seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); @@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) int err; /* device */ + seq_puts(m, "device "); if (sb->s_op->show_devname) { - seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { - if (r->mnt_devname) { - seq_puts(m, "device "); - mangle(m, r->mnt_devname); - } else - seq_puts(m, "no device"); + mangle(m, r->mnt_devname); } /* mount point */ From 101f2bbab541116ab861b9c3ac0ece07a7eaa756 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 7 May 2025 15:34:01 -0700 Subject: [PATCH 3/4] fs: convert mount flags to enum In prior kernel versions (5.8-6.8), commit 9f6c61f96f2d9 ("proc/mounts: add cursor") introduced MNT_CURSOR, a flag used by readers from /proc/mounts to keep their place while reading the file. Later, commit 2eea9ce4310d8 ("mounts: keep list of mounts in an rbtree") removed this flag and its value has since been repurposed. For debuggers iterating over the list of mounts, cursors should be skipped as they are irrelevant. Detecting whether an element is a cursor can be difficult. Since the MNT_CURSOR flag is a preprocessor constant, it's not present in debuginfo, and since its value is repurposed, we cannot hard-code it. For this specific issue, cursors are possible to detect in other ways, but ideally, we would be able to read the mount flag definitions out of the debuginfo. For that reason, convert the mount flags to an enum. Link: https://github.com/osandov/drgn/pull/496 Signed-off-by: Stephen Brennan Link: https://lore.kernel.org/20250507223402.2795029-1-stephen.s.brennan@oracle.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 77 ++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/include/linux/mount.h b/include/linux/mount.h index dcc17ce8a959..6904ad33ee7a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -22,48 +22,51 @@ struct fs_context; struct file; struct path; -#define MNT_NOSUID 0x01 -#define MNT_NODEV 0x02 -#define MNT_NOEXEC 0x04 -#define MNT_NOATIME 0x08 -#define MNT_NODIRATIME 0x10 -#define MNT_RELATIME 0x20 -#define MNT_READONLY 0x40 /* does the user want this to be r/o? */ -#define MNT_NOSYMFOLLOW 0x80 +enum mount_flags { + MNT_NOSUID = 0x01, + MNT_NODEV = 0x02, + MNT_NOEXEC = 0x04, + MNT_NOATIME = 0x08, + MNT_NODIRATIME = 0x10, + MNT_RELATIME = 0x20, + MNT_READONLY = 0x40, /* does the user want this to be r/o? */ + MNT_NOSYMFOLLOW = 0x80, -#define MNT_SHRINKABLE 0x100 -#define MNT_WRITE_HOLD 0x200 + MNT_SHRINKABLE = 0x100, + MNT_WRITE_HOLD = 0x200, -#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ -#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ -/* - * MNT_SHARED_MASK is the set of flags that should be cleared when a - * mount becomes shared. Currently, this is only the flag that says a - * mount cannot be bind mounted, since this is how we create a mount - * that shares events with another mount. If you add a new MNT_* - * flag, consider how it interacts with shared mounts. - */ -#define MNT_SHARED_MASK (MNT_UNBINDABLE) -#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ - | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ - | MNT_READONLY | MNT_NOSYMFOLLOW) -#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) + MNT_SHARED = 0x1000, /* if the vfsmount is a shared mount */ + MNT_UNBINDABLE = 0x2000, /* if the vfsmount is a unbindable mount */ -#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_INTERNAL = 0x4000, -#define MNT_INTERNAL 0x4000 + MNT_LOCK_ATIME = 0x040000, + MNT_LOCK_NOEXEC = 0x080000, + MNT_LOCK_NOSUID = 0x100000, + MNT_LOCK_NODEV = 0x200000, + MNT_LOCK_READONLY = 0x400000, + MNT_LOCKED = 0x800000, + MNT_DOOMED = 0x1000000, + MNT_SYNC_UMOUNT = 0x2000000, + MNT_MARKED = 0x4000000, + MNT_UMOUNT = 0x8000000, -#define MNT_LOCK_ATIME 0x040000 -#define MNT_LOCK_NOEXEC 0x080000 -#define MNT_LOCK_NOSUID 0x100000 -#define MNT_LOCK_NODEV 0x200000 -#define MNT_LOCK_READONLY 0x400000 -#define MNT_LOCKED 0x800000 -#define MNT_DOOMED 0x1000000 -#define MNT_SYNC_UMOUNT 0x2000000 -#define MNT_MARKED 0x4000000 -#define MNT_UMOUNT 0x8000000 + /* + * MNT_SHARED_MASK is the set of flags that should be cleared when a + * mount becomes shared. Currently, this is only the flag that says a + * mount cannot be bind mounted, since this is how we create a mount + * that shares events with another mount. If you add a new MNT_* + * flag, consider how it interacts with shared mounts. + */ + MNT_SHARED_MASK = MNT_UNBINDABLE, + MNT_USER_SETTABLE_MASK = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC + | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME + | MNT_READONLY | MNT_NOSYMFOLLOW, + MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, + + MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED, +}; struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ From 2b3c61b87519ff5d52fd9d6eb2632975f4e18b04 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 12 May 2025 01:49:53 +0300 Subject: [PATCH 4/4] statmount: update STATMOUNT_SUPPORTED macro According to commit 8f6116b5b77b ("statmount: add a new supported_mask field"), STATMOUNT_SUPPORTED macro shall be updated whenever a new flag is added. Fixes: 7a54947e727b ("Merge patch series "fs: allow changing idmappings"") Signed-off-by: "Dmitry V. Levin" Link: https://lore.kernel.org/20250511224953.GA17849@strace.io Signed-off-by: Christian Brauner --- fs/namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 924298d11df7..28e193324ba4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5797,7 +5797,9 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | \ STATMOUNT_OPT_SEC_ARRAY | \ - STATMOUNT_SUPPORTED_MASK) + STATMOUNT_SUPPORTED_MASK | \ + STATMOUNT_MNT_UIDMAP | \ + STATMOUNT_MNT_GIDMAP) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns)