From 12b5b138d111db0588492002fdd8089af61b80e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:55 +0200 Subject: [PATCH 01/31] coredump: allow for flexible coredump handling Extend the coredump socket to allow the coredump server to tell the kernel how to process individual coredumps. When the crashing task connects to the coredump socket the kernel will send a struct coredump_req to the coredump server. The kernel will set the size member of struct coredump_req allowing the coredump server how much data can be read. The coredump server uses MSG_PEEK to peek the size of struct coredump_req. If the kernel uses a newer struct coredump_req the coredump server just reads the size it knows and discard any remaining bytes in the buffer. If the kernel uses an older struct coredump_req the coredump server just reads the size the kernel knows. The returned struct coredump_req will inform the coredump server what features the kernel supports. The coredump_req->mask member is set to the currently know features. The coredump server may only use features whose bits were raised by the kernel in coredump_req->mask. In response to a coredump_req from the kernel the coredump server sends a struct coredump_ack to the kernel. The kernel informs the coredump server what version of struct coredump_ack it supports by setting struct coredump_req->size_ack to the size it knows about. The coredump server may only send as many bytes as coredump_req->size_ack indicates (a smaller size is fine of course). The coredump server must set coredump_ack->size accordingly. The coredump server sets the features it wants to use in struct coredump_ack->mask. Only bits returned in struct coredump_req->mask may be used. In case an invalid struct coredump_ack is sent to the kernel a non-zero u32 integer is sent indicating the reason for the failure. If it was successful a zero u32 integer is sent. In the initial version the following features are supported in coredump_{req,ack}->mask: * COREDUMP_KERNEL The kernel will write the coredump data to the socket. * COREDUMP_USERSPACE The kernel will not write coredump data but will indicate to the parent that a coredump has been generated. This is used when userspace generates its own coredumps. * COREDUMP_REJECT The kernel will skip generating a coredump for this task. * COREDUMP_WAIT The kernel will prevent the task from exiting until the coredump server has shutdown the socket connection. The flexible coredump socket can be enabled by using the "@@" prefix instead of the single "@" prefix for the regular coredump socket: @@/run/systemd/coredump.socket will enable flexible coredump handling. Current kernels already enforce that "@" must be followed by "/" and will reject anything else. So extending this is backward and forward compatible. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-1-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/coredump.c | 195 +++++++++++++++++++++++++++++----- include/uapi/linux/coredump.h | 104 ++++++++++++++++++ 2 files changed, 272 insertions(+), 27 deletions(-) create mode 100644 include/uapi/linux/coredump.h diff --git a/fs/coredump.c b/fs/coredump.c index f217ebf2b3b6..b3eaa8c27ced 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -83,15 +84,17 @@ static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; enum coredump_type_t { - COREDUMP_FILE = 1, - COREDUMP_PIPE = 2, - COREDUMP_SOCK = 3, + COREDUMP_FILE = 1, + COREDUMP_PIPE = 2, + COREDUMP_SOCK = 3, + COREDUMP_SOCK_REQ = 4, }; struct core_name { char *corename; int used, size; enum coredump_type_t core_type; + u64 mask; }; static int expand_corename(struct core_name *cn, int size) @@ -235,6 +238,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, int pid_in_pattern = 0; int err = 0; + cn->mask = COREDUMP_KERNEL; + if (core_pipe_limit) + cn->mask |= COREDUMP_WAIT; cn->used = 0; cn->corename = NULL; if (*pat_ptr == '|') @@ -264,6 +270,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, pat_ptr++; if (!(*pat_ptr)) return -ENOMEM; + if (*pat_ptr == '@') { + pat_ptr++; + if (!(*pat_ptr)) + return -ENOMEM; + + cn->core_type = COREDUMP_SOCK_REQ; + } err = cn_printf(cn, "%s", pat_ptr); if (err) @@ -632,6 +645,135 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) return 0; } +#ifdef CONFIG_UNIX +static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags) +{ + struct msghdr msg = {}; + struct kvec iov = { .iov_base = ack, .iov_len = size }; + ssize_t ret; + + memset(ack, 0, size); + ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags); + return ret == size; +} + +static inline bool coredump_sock_send(struct file *file, struct coredump_req *req) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req)); + return ret == sizeof(*req); +} + +static_assert(sizeof(enum coredump_mark) == sizeof(__u32)); + +static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark)); + return ret == sizeof(mark); +} + +static inline void coredump_sock_wait(struct file *file) +{ + ssize_t n; + + /* + * We use a simple read to wait for the coredump processing to + * finish. Either the socket is closed or we get sent unexpected + * data. In both cases, we're done. + */ + n = __kernel_read(file, &(char){ 0 }, 1, NULL); + if (n > 0) + coredump_report_failure("Coredump socket had unexpected data"); + else if (n < 0) + coredump_report_failure("Coredump socket failed"); +} + +static inline void coredump_sock_shutdown(struct file *file) +{ + struct socket *socket; + + socket = sock_from_file(file); + if (!socket) + return; + + /* Let userspace know we're done processing the coredump. */ + kernel_sock_shutdown(socket, SHUT_WR); +} + +static bool coredump_request(struct core_name *cn, struct coredump_params *cprm) +{ + struct coredump_req req = { + .size = sizeof(struct coredump_req), + .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT, + .size_ack = sizeof(struct coredump_ack), + }; + struct coredump_ack ack = {}; + ssize_t usize; + + if (cn->core_type != COREDUMP_SOCK_REQ) + return true; + + /* Let userspace know what we support. */ + if (!coredump_sock_send(cprm->file, &req)) + return false; + + /* Peek the size of the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size), + MSG_PEEK | MSG_WAITALL)) + return false; + + /* Refuse unknown coredump_ack sizes. */ + usize = ack.size; + if (usize < COREDUMP_ACK_SIZE_VER0) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE); + return false; + } + + if (usize > sizeof(ack)) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE); + return false; + } + + /* Now retrieve the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL)) + return false; + if (ack.size != usize) + return false; + + /* Refuse unknown coredump_ack flags. */ + if (ack.mask & ~req.mask) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + /* Refuse mutually exclusive options. */ + if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL | + COREDUMP_REJECT)) != 1) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING); + return false; + } + + if (ack.spare) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + cn->mask = ack.mask; + return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK); +} +#else +static inline void coredump_sock_wait(struct file *file) { } +static inline void coredump_sock_shutdown(struct file *file) { } +#endif + void do_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -850,6 +992,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) } break; } + case COREDUMP_SOCK_REQ: + fallthrough; case COREDUMP_SOCK: { #ifdef CONFIG_UNIX struct file *file __free(fput) = NULL; @@ -918,6 +1062,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) cprm.limit = RLIM_INFINITY; cprm.file = no_free_ptr(file); + + if (!coredump_request(&cn, &cprm)) + goto close_fail; #else coredump_report_failure("Core dump socket support %s disabled", cn.corename); goto close_fail; @@ -929,12 +1076,17 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } + /* Don't even generate the coredump. */ + if (cn.mask & COREDUMP_REJECT) + goto close_fail; + /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ retval = unshare_files(); if (retval) goto close_fail; - if (!dump_interrupted()) { + + if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would * have this set to NULL. @@ -962,38 +1114,27 @@ void do_coredump(const kernel_siginfo_t *siginfo) free_vma_snapshot(&cprm); } -#ifdef CONFIG_UNIX - /* Let userspace know we're done processing the coredump. */ - if (sock_from_file(cprm.file)) - kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR); -#endif + coredump_sock_shutdown(cprm.file); + + /* Let the parent know that a coredump was generated. */ + if (cn.mask & COREDUMP_USERSPACE) + core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server * or usermodehelper to finish before exiting so it can e.g., * inspect /proc/. */ - if (core_pipe_limit) { + if (cn.mask & COREDUMP_WAIT) { switch (cn.core_type) { case COREDUMP_PIPE: wait_for_dump_helpers(cprm.file); break; -#ifdef CONFIG_UNIX - case COREDUMP_SOCK: { - ssize_t n; - - /* - * We use a simple read to wait for the coredump - * processing to finish. Either the socket is - * closed or we get sent unexpected data. In - * both cases, we're done. - */ - n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL); - if (n != 0) - coredump_report_failure("Unexpected data on coredump socket"); + case COREDUMP_SOCK_REQ: + fallthrough; + case COREDUMP_SOCK: + coredump_sock_wait(cprm.file); break; - } -#endif default: break; } @@ -1249,8 +1390,8 @@ static inline bool check_coredump_socket(void) if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) return false; - /* Must be an absolute path. */ - if (*(core_pattern + 1) != '/') + /* Must be an absolute path or the socket request. */ + if (*(core_pattern + 1) != '/' && *(core_pattern + 1) != '@') return false; return true; diff --git a/include/uapi/linux/coredump.h b/include/uapi/linux/coredump.h new file mode 100644 index 000000000000..dc3789b78af0 --- /dev/null +++ b/include/uapi/linux/coredump.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_COREDUMP_H +#define _UAPI_LINUX_COREDUMP_H + +#include + +/** + * coredump_{req,ack} flags + * @COREDUMP_KERNEL: kernel writes coredump + * @COREDUMP_USERSPACE: userspace writes coredump + * @COREDUMP_REJECT: don't generate coredump + * @COREDUMP_WAIT: wait for coredump server + */ +enum { + COREDUMP_KERNEL = (1ULL << 0), + COREDUMP_USERSPACE = (1ULL << 1), + COREDUMP_REJECT = (1ULL << 2), + COREDUMP_WAIT = (1ULL << 3), +}; + +/** + * struct coredump_req - message kernel sends to userspace + * @size: size of struct coredump_req + * @size_ack: known size of struct coredump_ack on this kernel + * @mask: supported features + * + * When a coredump happens the kernel will connect to the coredump + * socket and send a coredump request to the coredump server. The @size + * member is set to the size of struct coredump_req and provides a hint + * to userspace how much data can be read. Userspace may use MSG_PEEK to + * peek the size of struct coredump_req and then choose to consume it in + * one go. Userspace may also simply read a COREDUMP_ACK_SIZE_VER0 + * request. If the size the kernel sends is larger userspace simply + * discards any remaining data. + * + * The coredump_req->mask member is set to the currently know features. + * Userspace may only set coredump_ack->mask to the bits raised by the + * kernel in coredump_req->mask. + * + * The coredump_req->size_ack member is set by the kernel to the size of + * struct coredump_ack the kernel knows. Userspace may only send up to + * coredump_req->size_ack bytes to the kernel and must set + * coredump_ack->size accordingly. + */ +struct coredump_req { + __u32 size; + __u32 size_ack; + __u64 mask; +}; + +enum { + COREDUMP_REQ_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * struct coredump_ack - message userspace sends to kernel + * @size: size of the struct + * @spare: unused + * @mask: features kernel is supposed to use + * + * The @size member must be set to the size of struct coredump_ack. It + * may never exceed what the kernel returned in coredump_req->size_ack + * but it may of course be smaller (>= COREDUMP_ACK_SIZE_VER0 and <= + * coredump_req->size_ack). + * + * The @mask member must be set to the features the coredump server + * wants the kernel to use. Only bits the kernel returned in + * coredump_req->mask may be set. + */ +struct coredump_ack { + __u32 size; + __u32 spare; + __u64 mask; +}; + +enum { + COREDUMP_ACK_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * enum coredump_mark - Markers for the coredump socket + * + * The kernel will place a single byte on the coredump socket. The + * markers notify userspace whether the coredump ack succeeded or + * failed. + * + * @COREDUMP_MARK_MINSIZE: the provided coredump_ack size was too small + * @COREDUMP_MARK_MAXSIZE: the provided coredump_ack size was too big + * @COREDUMP_MARK_UNSUPPORTED: the provided coredump_ack mask was invalid + * @COREDUMP_MARK_CONFLICTING: the provided coredump_ack mask has conflicting options + * @COREDUMP_MARK_REQACK: the coredump request and ack was successful + * @__COREDUMP_MARK_MAX: the maximum coredump mark value + */ +enum coredump_mark { + COREDUMP_MARK_REQACK = 0U, + COREDUMP_MARK_MINSIZE = 1U, + COREDUMP_MARK_MAXSIZE = 2U, + COREDUMP_MARK_UNSUPPORTED = 3U, + COREDUMP_MARK_CONFLICTING = 4U, + __COREDUMP_MARK_MAX = (1U << 31), +}; + +#endif /* _UAPI_LINUX_COREDUMP_H */ From 994dc26302ed744960ad74932eb206b49c0ebb44 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:56 +0200 Subject: [PATCH 02/31] selftests/coredump: fix build Fix various warnings in the selftest build. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-2-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/Makefile | 2 +- .../testing/selftests/coredump/stackdump_test.c | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index ed210037b29d..bc287a85b825 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS = $(KHDR_INCLUDES) +CFLAGS = -Wall -O0 $(KHDR_INCLUDES) TEST_GEN_PROGS := stackdump_test TEST_FILES := stackdump diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 9984413be9f0..aa366e6f13a7 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -24,6 +24,8 @@ static void *do_nothing(void *) { while (1) pause(); + + return NULL; } static void crashing_child(void) @@ -46,9 +48,7 @@ FIXTURE(coredump) FIXTURE_SETUP(coredump) { - char buf[PATH_MAX]; FILE *file; - char *dir; int ret; self->pid_coredump_server = -ESRCH; @@ -106,7 +106,6 @@ fail: TEST_F_TIMEOUT(coredump, stackdump, 120) { - struct sigaction action = {}; unsigned long long stack; char *test_dir, *line; size_t line_length; @@ -171,11 +170,10 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) TEST_F(coredump, socket) { - int fd, pidfd, ret, status; + int pidfd, ret, status; FILE *file; pid_t pid, pid_coredump_server; struct stat st; - char core_file[PATH_MAX]; struct pidfd_info info = {}; int ipc_sockets[2]; char c; @@ -356,11 +354,10 @@ TEST_F(coredump, socket) TEST_F(coredump, socket_detect_userspace_client) { - int fd, pidfd, ret, status; + int pidfd, ret, status; FILE *file; pid_t pid, pid_coredump_server; struct stat st; - char core_file[PATH_MAX]; struct pidfd_info info = {}; int ipc_sockets[2]; char c; @@ -384,7 +381,7 @@ TEST_F(coredump, socket_detect_userspace_client) pid_coredump_server = fork(); ASSERT_GE(pid_coredump_server, 0); if (pid_coredump_server == 0) { - int fd_server, fd_coredump, fd_peer_pidfd, fd_core_file; + int fd_server, fd_coredump, fd_peer_pidfd; socklen_t fd_peer_pidfd_len; close(ipc_sockets[0]); @@ -464,7 +461,6 @@ TEST_F(coredump, socket_detect_userspace_client) close(fd_coredump); close(fd_server); close(fd_peer_pidfd); - close(fd_core_file); _exit(EXIT_SUCCESS); } self->pid_coredump_server = pid_coredump_server; @@ -488,7 +484,6 @@ TEST_F(coredump, socket_detect_userspace_client) if (ret < 0) _exit(EXIT_FAILURE); - (void *)write(fd_socket, &(char){ 0 }, 1); close(fd_socket); _exit(EXIT_SUCCESS); } @@ -519,7 +514,6 @@ TEST_F(coredump, socket_enoent) int pidfd, ret, status; FILE *file; pid_t pid; - char core_file[PATH_MAX]; file = fopen("/proc/sys/kernel/core_pattern", "w"); ASSERT_NE(file, NULL); @@ -569,7 +563,6 @@ TEST_F(coredump, socket_no_listener) ASSERT_GE(pid_coredump_server, 0); if (pid_coredump_server == 0) { int fd_server; - socklen_t fd_peer_pidfd_len; close(ipc_sockets[0]); From 474dd09d22df1d3bae9211078185aab9b6f1635e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:57 +0200 Subject: [PATCH 03/31] selftests/coredump: cleanup coredump tests Make the selftests we added this cycle easier to read. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-3-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- .../selftests/coredump/stackdump_test.c | 411 ++++++++---------- 1 file changed, 175 insertions(+), 236 deletions(-) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index aa366e6f13a7..4d922e5f89fe 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -20,6 +21,10 @@ #define STACKDUMP_SCRIPT "stackdump" #define NUM_THREAD_SPAWN 128 +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + static void *do_nothing(void *) { while (1) @@ -109,7 +114,7 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) unsigned long long stack; char *test_dir, *line; size_t line_length; - char buf[PATH_MAX]; + char buf[PAGE_SIZE]; int ret, i, status; FILE *file; pid_t pid; @@ -168,152 +173,163 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) fclose(file); } +static int create_and_listen_unix_socket(const char *path) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + assert(strlen(path) < sizeof(addr.sun_path) - 1); + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + size_t addr_len = + offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; + int fd, ret; + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) + goto out; + + ret = bind(fd, (const struct sockaddr *)&addr, addr_len); + if (ret < 0) + goto out; + + ret = listen(fd, 1); + if (ret < 0) + goto out; + + return fd; + +out: + if (fd >= 0) + close(fd); + return -1; +} + +static bool set_core_pattern(const char *pattern) +{ + FILE *file; + int ret; + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) + return false; + + ret = fprintf(file, "%s", pattern); + fclose(file); + + return ret == strlen(pattern); +} + +static int get_peer_pidfd(int fd) +{ + int fd_peer_pidfd; + socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); + int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, + &fd_peer_pidfd_len); + if (ret < 0) { + fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); + return -1; + } + return fd_peer_pidfd; +} + +static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) +{ + memset(info, 0, sizeof(*info)); + info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0; +} + +static void +wait_and_check_coredump_server(pid_t pid_coredump_server, + struct __test_metadata *const _metadata, + FIXTURE_DATA(coredump)* self) +{ + int status; + waitpid(pid_coredump_server, &status, 0); + self->pid_coredump_server = -ESRCH; + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + TEST_F(coredump, socket) { int pidfd, ret, status; - FILE *file; pid_t pid, pid_coredump_server; struct stat st; struct pidfd_info info = {}; int ipc_sockets[2]; char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); ASSERT_EQ(ret, 0); - file = fopen("/proc/sys/kernel/core_pattern", "w"); - ASSERT_NE(file, NULL); - - ret = fprintf(file, "@/tmp/coredump.socket"); - ASSERT_EQ(ret, strlen("@/tmp/coredump.socket")); - ASSERT_EQ(fclose(file), 0); - pid_coredump_server = fork(); ASSERT_GE(pid_coredump_server, 0); if (pid_coredump_server == 0) { - int fd_server, fd_coredump, fd_peer_pidfd, fd_core_file; - socklen_t fd_peer_pidfd_len; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; close(ipc_sockets[0]); - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); if (fd_server < 0) - _exit(EXIT_FAILURE); + goto out; - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) { - fprintf(stderr, "Failed to bind coredump socket\n"); - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } - - ret = listen(fd_server, 1); - if (ret < 0) { - fprintf(stderr, "Failed to listen on coredump socket\n"); - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; close(ipc_sockets[1]); fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, "Failed to accept coredump socket connection\n"); - close(fd_server); - _exit(EXIT_FAILURE); - } + if (fd_coredump < 0) + goto out; - fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - ret = getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD, - &fd_peer_pidfd, &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - close(fd_coredump); - close(fd_server); - _exit(EXIT_FAILURE); - } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; - memset(&info, 0, sizeof(info)); - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, &info); - if (ret < 0) { - fprintf(stderr, "Failed to retrieve pidfd info from peer pidfd for coredump socket connection\n"); - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "Missing coredump information from coredumping task\n"); - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; - if (!(info.coredump_mask & PIDFD_COREDUMPED)) { - fprintf(stderr, "Received connection from non-coredumping task\n"); - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) { - fprintf(stderr, "Failed to create coredump file\n"); - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } + if (fd_core_file < 0) + goto out; for (;;) { char buffer[4096]; ssize_t bytes_read, bytes_write; bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - close(fd_core_file); - _exit(EXIT_FAILURE); - } + if (bytes_read < 0) + goto out; if (bytes_read == 0) break; bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) { - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - close(fd_core_file); - _exit(EXIT_FAILURE); - } + if (bytes_read != bytes_write) + goto out; } - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - close(fd_core_file); - _exit(EXIT_SUCCESS); + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); } self->pid_coredump_server = pid_coredump_server; @@ -333,47 +349,27 @@ TEST_F(coredump, socket) ASSERT_TRUE(WIFSIGNALED(status)); ASSERT_TRUE(WCOREDUMP(status)); - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); ASSERT_GT(st.st_size, 0); - /* - * We should somehow validate the produced core file. - * For now just allow for visual inspection - */ system("file /tmp/coredump.file"); } TEST_F(coredump, socket_detect_userspace_client) { int pidfd, ret, status; - FILE *file; pid_t pid, pid_coredump_server; struct stat st; struct pidfd_info info = {}; int ipc_sockets[2]; char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - file = fopen("/proc/sys/kernel/core_pattern", "w"); - ASSERT_NE(file, NULL); - - ret = fprintf(file, "@/tmp/coredump.socket"); - ASSERT_EQ(ret, strlen("@/tmp/coredump.socket")); - ASSERT_EQ(fclose(file), 0); + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); ASSERT_EQ(ret, 0); @@ -381,87 +377,46 @@ TEST_F(coredump, socket_detect_userspace_client) pid_coredump_server = fork(); ASSERT_GE(pid_coredump_server, 0); if (pid_coredump_server == 0) { - int fd_server, fd_coredump, fd_peer_pidfd; - socklen_t fd_peer_pidfd_len; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; close(ipc_sockets[0]); - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); if (fd_server < 0) - _exit(EXIT_FAILURE); + goto out; - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) { - fprintf(stderr, "Failed to bind coredump socket\n"); - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } - - ret = listen(fd_server, 1); - if (ret < 0) { - fprintf(stderr, "Failed to listen on coredump socket\n"); - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; close(ipc_sockets[1]); fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, "Failed to accept coredump socket connection\n"); - close(fd_server); - _exit(EXIT_FAILURE); - } + if (fd_coredump < 0) + goto out; - fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - ret = getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD, - &fd_peer_pidfd, &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - close(fd_coredump); - close(fd_server); - _exit(EXIT_FAILURE); - } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; - memset(&info, 0, sizeof(info)); - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, &info); - if (ret < 0) { - fprintf(stderr, "Failed to retrieve pidfd info from peer pidfd for coredump socket connection\n"); - close(fd_coredump); - close(fd_server); + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (info.coredump_mask & PIDFD_COREDUMPED) + goto out; + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } - - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "Missing coredump information from coredumping task\n"); + if (fd_coredump >= 0) close(fd_coredump); + if (fd_server >= 0) close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } - - if (info.coredump_mask & PIDFD_COREDUMPED) { - fprintf(stderr, "Received unexpected connection from coredumping task\n"); - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_FAILURE); - } - - close(fd_coredump); - close(fd_server); - close(fd_peer_pidfd); - _exit(EXIT_SUCCESS); + _exit(exit_code); } self->pid_coredump_server = pid_coredump_server; @@ -474,12 +429,18 @@ TEST_F(coredump, socket_detect_userspace_client) if (pid == 0) { int fd_socket; ssize_t ret; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = + offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); if (fd_socket < 0) _exit(EXIT_FAILURE); - ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); if (ret < 0) _exit(EXIT_FAILURE); @@ -495,15 +456,11 @@ TEST_F(coredump, socket_detect_userspace_client) ASSERT_TRUE(WIFEXITED(status)); ASSERT_EQ(WEXITSTATUS(status), 0); - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); ASSERT_NE(stat("/tmp/coredump.file", &st), 0); ASSERT_EQ(errno, ENOENT); @@ -511,16 +468,10 @@ TEST_F(coredump, socket_detect_userspace_client) TEST_F(coredump, socket_enoent) { - int pidfd, ret, status; - FILE *file; + int pidfd, status; pid_t pid; - file = fopen("/proc/sys/kernel/core_pattern", "w"); - ASSERT_NE(file, NULL); - - ret = fprintf(file, "@/tmp/coredump.socket"); - ASSERT_EQ(ret, strlen("@/tmp/coredump.socket")); - ASSERT_EQ(fclose(file), 0); + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); pid = fork(); ASSERT_GE(pid, 0); @@ -538,7 +489,6 @@ TEST_F(coredump, socket_enoent) TEST_F(coredump, socket_no_listener) { int pidfd, ret, status; - FILE *file; pid_t pid, pid_coredump_server; int ipc_sockets[2]; char c; @@ -549,44 +499,36 @@ TEST_F(coredump, socket_no_listener) size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + sizeof("/tmp/coredump.socket"); + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); ASSERT_EQ(ret, 0); - file = fopen("/proc/sys/kernel/core_pattern", "w"); - ASSERT_NE(file, NULL); - - ret = fprintf(file, "@/tmp/coredump.socket"); - ASSERT_EQ(ret, strlen("@/tmp/coredump.socket")); - ASSERT_EQ(fclose(file), 0); - pid_coredump_server = fork(); ASSERT_GE(pid_coredump_server, 0); if (pid_coredump_server == 0) { - int fd_server; + int fd_server = -1; + int exit_code = EXIT_FAILURE; close(ipc_sockets[0]); fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); if (fd_server < 0) - _exit(EXIT_FAILURE); + goto out; ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) { - fprintf(stderr, "Failed to bind coredump socket\n"); - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } + if (ret < 0) + goto out; - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - close(fd_server); - close(ipc_sockets[1]); - _exit(EXIT_FAILURE); - } + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; - close(fd_server); + exit_code = EXIT_SUCCESS; +out: + if (fd_server >= 0) + close(fd_server); close(ipc_sockets[1]); - _exit(EXIT_SUCCESS); + _exit(exit_code); } self->pid_coredump_server = pid_coredump_server; @@ -606,10 +548,7 @@ TEST_F(coredump, socket_no_listener) ASSERT_TRUE(WIFSIGNALED(status)); ASSERT_FALSE(WCOREDUMP(status)); - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } TEST_HARNESS_MAIN From be227ba8215f08aaeb9bfc3ce5f1db8763e7b490 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:58 +0200 Subject: [PATCH 04/31] tools: add coredump.h header Copy the coredump header so we can rely on it in the selftests. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-4-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- tools/include/uapi/linux/coredump.h | 104 ++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 tools/include/uapi/linux/coredump.h diff --git a/tools/include/uapi/linux/coredump.h b/tools/include/uapi/linux/coredump.h new file mode 100644 index 000000000000..dc3789b78af0 --- /dev/null +++ b/tools/include/uapi/linux/coredump.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_COREDUMP_H +#define _UAPI_LINUX_COREDUMP_H + +#include + +/** + * coredump_{req,ack} flags + * @COREDUMP_KERNEL: kernel writes coredump + * @COREDUMP_USERSPACE: userspace writes coredump + * @COREDUMP_REJECT: don't generate coredump + * @COREDUMP_WAIT: wait for coredump server + */ +enum { + COREDUMP_KERNEL = (1ULL << 0), + COREDUMP_USERSPACE = (1ULL << 1), + COREDUMP_REJECT = (1ULL << 2), + COREDUMP_WAIT = (1ULL << 3), +}; + +/** + * struct coredump_req - message kernel sends to userspace + * @size: size of struct coredump_req + * @size_ack: known size of struct coredump_ack on this kernel + * @mask: supported features + * + * When a coredump happens the kernel will connect to the coredump + * socket and send a coredump request to the coredump server. The @size + * member is set to the size of struct coredump_req and provides a hint + * to userspace how much data can be read. Userspace may use MSG_PEEK to + * peek the size of struct coredump_req and then choose to consume it in + * one go. Userspace may also simply read a COREDUMP_ACK_SIZE_VER0 + * request. If the size the kernel sends is larger userspace simply + * discards any remaining data. + * + * The coredump_req->mask member is set to the currently know features. + * Userspace may only set coredump_ack->mask to the bits raised by the + * kernel in coredump_req->mask. + * + * The coredump_req->size_ack member is set by the kernel to the size of + * struct coredump_ack the kernel knows. Userspace may only send up to + * coredump_req->size_ack bytes to the kernel and must set + * coredump_ack->size accordingly. + */ +struct coredump_req { + __u32 size; + __u32 size_ack; + __u64 mask; +}; + +enum { + COREDUMP_REQ_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * struct coredump_ack - message userspace sends to kernel + * @size: size of the struct + * @spare: unused + * @mask: features kernel is supposed to use + * + * The @size member must be set to the size of struct coredump_ack. It + * may never exceed what the kernel returned in coredump_req->size_ack + * but it may of course be smaller (>= COREDUMP_ACK_SIZE_VER0 and <= + * coredump_req->size_ack). + * + * The @mask member must be set to the features the coredump server + * wants the kernel to use. Only bits the kernel returned in + * coredump_req->mask may be set. + */ +struct coredump_ack { + __u32 size; + __u32 spare; + __u64 mask; +}; + +enum { + COREDUMP_ACK_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * enum coredump_mark - Markers for the coredump socket + * + * The kernel will place a single byte on the coredump socket. The + * markers notify userspace whether the coredump ack succeeded or + * failed. + * + * @COREDUMP_MARK_MINSIZE: the provided coredump_ack size was too small + * @COREDUMP_MARK_MAXSIZE: the provided coredump_ack size was too big + * @COREDUMP_MARK_UNSUPPORTED: the provided coredump_ack mask was invalid + * @COREDUMP_MARK_CONFLICTING: the provided coredump_ack mask has conflicting options + * @COREDUMP_MARK_REQACK: the coredump request and ack was successful + * @__COREDUMP_MARK_MAX: the maximum coredump mark value + */ +enum coredump_mark { + COREDUMP_MARK_REQACK = 0U, + COREDUMP_MARK_MINSIZE = 1U, + COREDUMP_MARK_MAXSIZE = 2U, + COREDUMP_MARK_UNSUPPORTED = 3U, + COREDUMP_MARK_CONFLICTING = 4U, + __COREDUMP_MARK_MAX = (1U << 31), +}; + +#endif /* _UAPI_LINUX_COREDUMP_H */ From 59cd658eaf404e3634624b25afc3233066bea34c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:59 +0200 Subject: [PATCH 05/31] selftests/coredump: add coredump server selftests This adds extensive tests for the coredump server. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-5-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/Makefile | 2 +- tools/testing/selftests/coredump/config | 3 + .../selftests/coredump/stackdump_test.c | 1255 ++++++++++++++++- 3 files changed, 1258 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/coredump/config diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index bc287a85b825..77b3665c73c7 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS = -Wall -O0 $(KHDR_INCLUDES) +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) TEST_GEN_PROGS := stackdump_test TEST_FILES := stackdump diff --git a/tools/testing/selftests/coredump/config b/tools/testing/selftests/coredump/config new file mode 100644 index 000000000000..a05ef112b4f9 --- /dev/null +++ b/tools/testing/selftests/coredump/config @@ -0,0 +1,3 @@ +CONFIG_COREDUMP=y +CONFIG_NET=y +CONFIG_UNIX=y diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 4d922e5f89fe..9a789156f27e 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -4,10 +4,15 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include +#include #include #include #include @@ -15,6 +20,7 @@ #include #include "../kselftest_harness.h" +#include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" #define STACKDUMP_FILE "stack_values" @@ -49,14 +55,32 @@ FIXTURE(coredump) { char original_core_pattern[256]; pid_t pid_coredump_server; + int fd_tmpfs_detached; }; +static int create_detached_tmpfs(void) +{ + int fd_context, fd_tmpfs; + + fd_context = sys_fsopen("tmpfs", 0); + if (fd_context < 0) + return -1; + + if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -1; + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + close(fd_context); + return fd_tmpfs; +} + FIXTURE_SETUP(coredump) { FILE *file; int ret; self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; file = fopen("/proc/sys/kernel/core_pattern", "r"); ASSERT_NE(NULL, file); @@ -65,6 +89,8 @@ FIXTURE_SETUP(coredump) ASSERT_LT(ret, sizeof(self->original_core_pattern)); self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); ret = fclose(file); ASSERT_EQ(0, ret); @@ -103,6 +129,15 @@ FIXTURE_TEARDOWN(coredump) goto fail; } + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + return; fail: /* This should never happen */ @@ -192,7 +227,7 @@ static int create_and_listen_unix_socket(const char *path) if (ret < 0) goto out; - ret = listen(fd, 1); + ret = listen(fd, 128); if (ret < 0) goto out; @@ -551,4 +586,1222 @@ out: wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } +static ssize_t recv_marker(int fd) +{ + enum coredump_mark mark = COREDUMP_MARK_REQACK; + ssize_t ret; + + ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); + if (ret != sizeof(mark)) + return -1; + + switch (mark) { + case COREDUMP_MARK_REQACK: + fprintf(stderr, "Received marker: ReqAck\n"); + return COREDUMP_MARK_REQACK; + case COREDUMP_MARK_MINSIZE: + fprintf(stderr, "Received marker: MinSize\n"); + return COREDUMP_MARK_MINSIZE; + case COREDUMP_MARK_MAXSIZE: + fprintf(stderr, "Received marker: MaxSize\n"); + return COREDUMP_MARK_MAXSIZE; + case COREDUMP_MARK_UNSUPPORTED: + fprintf(stderr, "Received marker: Unsupported\n"); + return COREDUMP_MARK_UNSUPPORTED; + case COREDUMP_MARK_CONFLICTING: + fprintf(stderr, "Received marker: Conflicting\n"); + return COREDUMP_MARK_CONFLICTING; + default: + fprintf(stderr, "Received unknown marker: %u\n", mark); + break; + } + return -1; +} + +static bool read_marker(int fd, enum coredump_mark mark) +{ + ssize_t ret; + + ret = recv_marker(fd); + if (ret < 0) + return false; + return ret == mark; +} + +static bool read_coredump_req(int fd, struct coredump_req *req) +{ + ssize_t ret; + size_t field_size, user_size, ack_size, kernel_size, remaining_size; + + memset(req, 0, sizeof(*req)); + field_size = sizeof(req->size); + + /* Peek the size of the coredump request. */ + ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); + if (ret != field_size) + return false; + kernel_size = req->size; + + if (kernel_size < COREDUMP_ACK_SIZE_VER0) + return false; + if (kernel_size >= PAGE_SIZE) + return false; + + /* Use the minimum of user and kernel size to read the full request. */ + user_size = sizeof(struct coredump_req); + ack_size = user_size < kernel_size ? user_size : kernel_size; + ret = recv(fd, req, ack_size, MSG_WAITALL); + if (ret != ack_size) + return false; + + fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", + req->size, (unsigned long long)req->mask); + + if (user_size > kernel_size) + remaining_size = user_size - kernel_size; + else + remaining_size = kernel_size - user_size; + + if (PAGE_SIZE <= remaining_size) + return false; + + /* + * Discard any additional data if the kernel's request was larger than + * what we knew about or cared about. + */ + if (remaining_size) { + char buffer[PAGE_SIZE]; + + ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); + if (ret != remaining_size) + return false; + fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); + } + + return true; +} + +static bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack) +{ + ssize_t ret; + /* + * Wrap struct coredump_ack in a larger struct so we can + * simulate sending to much data to the kernel. + */ + struct large_ack_for_size_testing { + struct coredump_ack ack; + char buffer[PAGE_SIZE]; + } large_ack = {}; + + if (!size_ack) + size_ack = sizeof(struct coredump_ack) < req->size_ack ? + sizeof(struct coredump_ack) : + req->size_ack; + large_ack.ack.mask = mask; + large_ack.ack.size = size_ack; + ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); + if (ret != size_ack) + return false; + + fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", + size_ack, (unsigned long long)mask); + return true; +} + +static bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask) +{ + if (req->size < min_size) + return false; + if ((req->mask & required_mask) != required_mask) + return false; + if (req->mask & ~required_mask) + return false; + return true; +} + +TEST_F(coredump, socket_request_kernel) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) + goto out; + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) + goto out; + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) + goto out; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); + system("file /tmp/coredump.file"); +} + +TEST_F(coredump, socket_request_userspace) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) + goto out; + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) + goto out; + + if (bytes_read < 0) + goto out; + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_reject) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) + goto out; + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) + goto out; + + if (bytes_read < 0) + goto out; + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_flag_combination) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) + goto out; + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_unknown_flag) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) + goto out; + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_small) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 / 2)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) + goto out; + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_large) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) + goto out; + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + + if (!(info.mask & PIDFD_INFO_COREDUMP)) + goto out; + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + + if (!read_coredump_req(fd_coredump, &req)) + goto out; + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) + goto out; + + if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) + goto out; + + exit_code = EXIT_SUCCESS; +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +static int open_coredump_tmpfile(int fd_tmpfs_detached) +{ + return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); +} + +#define NUM_CRASHING_COREDUMPS 5 + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + struct coredump_req req = {}; + + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "Failed to create and listen on unix socket\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "Failed to notify parent via ipc socket\n"); + goto out; + } + close(ipc_sockets[1]); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); + goto out; + } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); + goto out; + } + } + + close(fd_core_file); + close(fd_peer_pidfd); + close(fd_coredump); + fd_peer_pidfd = -1; + fd_coredump = -1; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + waitpid(pid[i], &status[i], 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +#define MAX_EVENTS 128 + +static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) +{ + int epfd = -1; + int exit_code = EXIT_FAILURE; + + epfd = epoll_create1(0); + if (epfd < 0) + goto out; + + struct epoll_event ev; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + ev.data.fd = fd_coredump; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) + goto out; + + for (;;) { + struct epoll_event events[1]; + int n = epoll_wait(epfd, events, 1, -1); + if (n < 0) + break; + + if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { + for (;;) { + char buffer[4096]; + ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + goto out; + } + if (bytes_read == 0) + goto done; + ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_write != bytes_read) + goto out; + } + } + } + +done: + exit_code = EXIT_SUCCESS; +out: + if (epfd >= 0) + close(epfd); + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + _exit(exit_code); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; + fd_server = -1; + exit_code = EXIT_FAILURE; + n_conns = 0; + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) + goto out; + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + goto out; + close(ipc_sockets[1]); + + while (n_conns < NUM_CRASHING_COREDUMPS) { + int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + struct coredump_req req = {}; + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + continue; + goto out; + } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) + goto out; + if (!get_pidfd_info(fd_peer_pidfd, &info)) + goto out; + if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) + goto out; + if (!read_coredump_req(fd_coredump, &req)) + goto out; + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) + goto out; + if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) + goto out; + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) + goto out; + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) + goto out; + pid_t worker = fork(); + if (worker == 0) { + close(fd_server); + process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); + } + worker_pids[n_conns] = worker; + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_core_file >= 0) + close(fd_core_file); + n_conns++; + } + exit_code = EXIT_SUCCESS; +out: + if (fd_server >= 0) + close(fd_server); + + // Reap all worker processes + for (int i = 0; i < n_conns; i++) { + int wstatus; + if (waitpid(worker_pids[i], &wstatus, 0) < 0) { + fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); + } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { + fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); + exit_code = EXIT_FAILURE; + } + } + + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + TEST_HARNESS_MAIN From e04f97c8be29523bae2576fceee84a4b030406fb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Jun 2025 11:53:15 +0200 Subject: [PATCH 06/31] coredump: cleanup coredump socket functions We currently use multiple CONFIG_UNIX guards. This looks messy and makes the code harder to follow and maintain. Use a helper function coredump_sock_connect() that handles the connect portion. This allows us to remove the CONFIG_UNIX guard in the main do_coredump() function. Link: https://lore.kernel.org/20250605-schlamm-touren-720ba2b60a85@brauner Signed-off-by: Christian Brauner --- fs/coredump.c | 157 ++++++++++++++++++++++++++------------------------ 1 file changed, 83 insertions(+), 74 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index b3eaa8c27ced..3568c300623c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -646,6 +646,77 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) } #ifdef CONFIG_UNIX +static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm) +{ + struct file *file __free(fput) = NULL; + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + ssize_t addr_len; + int retval; + struct socket *socket; + + addr_len = strscpy(addr.sun_path, cn->corename); + if (addr_len < 0) + return false; + addr_len += offsetof(struct sockaddr_un, sun_path) + 1; + + /* + * It is possible that the userspace process which is supposed + * to handle the coredump and is listening on the AF_UNIX socket + * coredumps. Userspace should just mark itself non dumpable. + */ + + retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket); + if (retval < 0) + return false; + + file = sock_alloc_file(socket, 0, NULL); + if (IS_ERR(file)) + return false; + + /* + * Set the thread-group leader pid which is used for the peer + * credentials during connect() below. Then immediately register + * it in pidfs... + */ + cprm->pid = task_tgid(current); + retval = pidfs_register_pid(cprm->pid); + if (retval) + return false; + + /* + * ... and set the coredump information so userspace has it + * available after connect()... + */ + pidfs_coredump(cprm); + + retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, + O_NONBLOCK | SOCK_COREDUMP); + /* + * ... Make sure to only put our reference after connect() took + * its own reference keeping the pidfs entry alive ... + */ + pidfs_put_pid(cprm->pid); + + if (retval) { + if (retval == -EAGAIN) + coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); + else + coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval); + return false; + } + + /* ... and validate that @sk_peer_pid matches @cprm.pid. */ + if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid)) + return false; + + cprm->limit = RLIM_INFINITY; + cprm->file = no_free_ptr(file); + + return true; +} + static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags) { struct msghdr msg = {}; @@ -707,7 +778,7 @@ static inline void coredump_sock_shutdown(struct file *file) kernel_sock_shutdown(socket, SHUT_WR); } -static bool coredump_request(struct core_name *cn, struct coredump_params *cprm) +static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm) { struct coredump_req req = { .size = sizeof(struct coredump_req), @@ -770,6 +841,14 @@ static bool coredump_request(struct core_name *cn, struct coredump_params *cprm) return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK); } #else +static bool coredump_sock_connect(struct core_name *cn, + struct coredump_params *cprm) +{ + coredump_report_failure("Core dump socket support %s disabled", cn->corename); + return false; +} +static bool coredump_sock_request(struct core_name *cn, + struct coredump_params *cprm) { return false; } static inline void coredump_sock_wait(struct file *file) { } static inline void coredump_sock_shutdown(struct file *file) { } #endif @@ -994,83 +1073,13 @@ void do_coredump(const kernel_siginfo_t *siginfo) } case COREDUMP_SOCK_REQ: fallthrough; - case COREDUMP_SOCK: { -#ifdef CONFIG_UNIX - struct file *file __free(fput) = NULL; - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - ssize_t addr_len; - struct socket *socket; - - addr_len = strscpy(addr.sun_path, cn.corename); - if (addr_len < 0) - goto close_fail; - addr_len += offsetof(struct sockaddr_un, sun_path) + 1; - - /* - * It is possible that the userspace process which is - * supposed to handle the coredump and is listening on - * the AF_UNIX socket coredumps. Userspace should just - * mark itself non dumpable. - */ - - retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket); - if (retval < 0) + case COREDUMP_SOCK: + if (!coredump_sock_connect(&cn, &cprm)) goto close_fail; - file = sock_alloc_file(socket, 0, NULL); - if (IS_ERR(file)) + if (!coredump_sock_request(&cn, &cprm)) goto close_fail; - - /* - * Set the thread-group leader pid which is used for the - * peer credentials during connect() below. Then - * immediately register it in pidfs... - */ - cprm.pid = task_tgid(current); - retval = pidfs_register_pid(cprm.pid); - if (retval) - goto close_fail; - - /* - * ... and set the coredump information so userspace - * has it available after connect()... - */ - pidfs_coredump(&cprm); - - retval = kernel_connect(socket, (struct sockaddr *)(&addr), - addr_len, O_NONBLOCK | SOCK_COREDUMP); - - /* - * ... Make sure to only put our reference after connect() took - * its own reference keeping the pidfs entry alive ... - */ - pidfs_put_pid(cprm.pid); - - if (retval) { - if (retval == -EAGAIN) - coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); - else - coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval); - goto close_fail; - } - - /* ... and validate that @sk_peer_pid matches @cprm.pid. */ - if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid)) - goto close_fail; - - cprm.limit = RLIM_INFINITY; - cprm.file = no_free_ptr(file); - - if (!coredump_request(&cn, &cprm)) - goto close_fail; -#else - coredump_report_failure("Core dump socket support %s disabled", cn.corename); - goto close_fail; -#endif break; - } default: WARN_ON_ONCE(true); goto close_fail; From fb4366ba8f1c86a72ffcb2b6f349e05cf77897d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:15 +0200 Subject: [PATCH 07/31] coredump: rename format_corename() It's not really about the name anymore. It parses very distinct information. Reflect that in the name. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-1-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 3568c300623c..79a3c8141e8c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -225,12 +225,13 @@ put_exe_file: return ret; } -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. +/* + * coredump_parse will inspect the pattern parameter, and output a name + * into corename, which must have space for at least CORENAME_MAX_SIZE + * bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm, - size_t **argv, int *argc) +static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, + size_t **argv, int *argc) { const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; @@ -910,7 +911,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) old_cred = override_creds(cred); - retval = format_corename(&cn, &cprm, &argv, &argc); + retval = coredump_parse(&cn, &cprm, &argv, &argc); if (retval < 0) { coredump_report_failure("format_corename failed, aborting core"); goto fail_unlock; From a5715af549b2ee0139ff965d337cfd1a5f7ee615 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:16 +0200 Subject: [PATCH 08/31] coredump: make coredump_parse() return bool There's no point in returning negative error values. They will never be seen by anyone. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-2-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 79a3c8141e8c..42ceb9db2a5a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -230,8 +230,8 @@ put_exe_file: * into corename, which must have space for at least CORENAME_MAX_SIZE * bytes plus one byte for the zero terminator. */ -static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, - size_t **argv, int *argc) +static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, + size_t **argv, int *argc) { const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; @@ -251,7 +251,7 @@ static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, else cn->core_type = COREDUMP_FILE; if (expand_corename(cn, core_name_size)) - return -ENOMEM; + return false; cn->corename[0] = '\0'; switch (cn->core_type) { @@ -259,33 +259,33 @@ static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, int argvs = sizeof(core_pattern) / 2; (*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL); if (!(*argv)) - return -ENOMEM; + return false; (*argv)[(*argc)++] = 0; ++pat_ptr; if (!(*pat_ptr)) - return -ENOMEM; + return false; break; } case COREDUMP_SOCK: { /* skip the @ */ pat_ptr++; if (!(*pat_ptr)) - return -ENOMEM; + return false; if (*pat_ptr == '@') { pat_ptr++; if (!(*pat_ptr)) - return -ENOMEM; + return false; cn->core_type = COREDUMP_SOCK_REQ; } err = cn_printf(cn, "%s", pat_ptr); if (err) - return err; + return false; /* Require absolute paths. */ if (cn->corename[0] != '/') - return -EINVAL; + return false; /* * Ensure we can uses spaces to indicate additional @@ -293,7 +293,7 @@ static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, */ if (strchr(cn->corename, ' ')) { coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename); - return -EINVAL; + return false; } /* @@ -303,13 +303,13 @@ static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, * via /proc/, using the SO_PEERPIDFD to guard * against pid recycling when opening /proc/. */ - return 0; + return true; } case COREDUMP_FILE: break; default: WARN_ON_ONCE(true); - return -EINVAL; + return false; } /* Repeat as long as we have more pattern to process and more output @@ -447,7 +447,7 @@ static int coredump_parse(struct core_name *cn, struct coredump_params *cprm, } if (err) - return err; + return false; } out: @@ -457,9 +457,9 @@ out: * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid) - return cn_printf(cn, ".%d", task_tgid_vnr(current)); + return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0; - return 0; + return true; } static int zap_process(struct signal_struct *signal, int exit_code) @@ -911,8 +911,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) old_cred = override_creds(cred); - retval = coredump_parse(&cn, &cprm, &argv, &argc); - if (retval < 0) { + if (!coredump_parse(&cn, &cprm, &argv, &argc)) { coredump_report_failure("format_corename failed, aborting core"); goto fail_unlock; } From 67c3a0b0ad1a78d7ee9c3aadaed22561f7f85466 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:17 +0200 Subject: [PATCH 09/31] coredump: fix socket path validation Make sure that we keep it extensible and well-formed. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-3-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 42ceb9db2a5a..70e37435eca9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1399,9 +1399,17 @@ static inline bool check_coredump_socket(void) if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) return false; - /* Must be an absolute path or the socket request. */ - if (*(core_pattern + 1) != '/' && *(core_pattern + 1) != '@') + /* Must be an absolute path... */ + if (core_pattern[1] != '/') { + /* ... or the socket request protocol... */ + if (core_pattern[1] != '@') + return false; + /* ... and if so must be an absolute path. */ + if (core_pattern[2] != '/') + return false; + /* Anything else is unsupported. */ return false; + } return true; } From 3a2c977c463c68bf6fcd0138d15efa5f3adc743c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:18 +0200 Subject: [PATCH 10/31] coredump: validate that path doesn't exceed UNIX_PATH_MAX so we don't pointlessly accepts things that go over the limit. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-4-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 70e37435eca9..a64b87878ab3 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1388,6 +1388,8 @@ void validate_coredump_safety(void) static inline bool check_coredump_socket(void) { + const char *p; + if (core_pattern[0] != '@') return true; @@ -1407,10 +1409,15 @@ static inline bool check_coredump_socket(void) /* ... and if so must be an absolute path. */ if (core_pattern[2] != '/') return false; - /* Anything else is unsupported. */ - return false; + p = &core_pattern[2]; + } else { + p = &core_pattern[1]; } + /* The path obviously cannot exceed UNIX_PATH_MAX. */ + if (strlen(p) >= UNIX_PATH_MAX) + return false; + return true; } From 0da3e3822cfabf062945e449f91ea3ca529eeaa4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:19 +0200 Subject: [PATCH 11/31] fs: move name_contains_dotdot() to header Move the helper from the firmware specific code to a header so we can reuse it for coredump sockets. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-5-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- drivers/base/firmware_loader/main.c | 31 ++++++++++------------------- include/linux/fs.h | 16 +++++++++++++++ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 44486b2c7172..6942c62fa59d 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -822,26 +822,6 @@ static void fw_log_firmware_info(const struct firmware *fw, const char *name, {} #endif -/* - * Reject firmware file names with ".." path components. - * There are drivers that construct firmware file names from device-supplied - * strings, and we don't want some device to be able to tell us "I would like to - * be sent my firmware from ../../../etc/shadow, please". - * - * Search for ".." surrounded by either '/' or start/end of string. - * - * This intentionally only looks at the firmware name, not at the firmware base - * directory or at symlink contents. - */ -static bool name_contains_dotdot(const char *name) -{ - size_t name_len = strlen(name); - - return strcmp(name, "..") == 0 || strncmp(name, "../", 3) == 0 || - strstr(name, "/../") != NULL || - (name_len >= 3 && strcmp(name+name_len-3, "/..") == 0); -} - /* called from request_firmware() and request_firmware_work_func() */ static int _request_firmware(const struct firmware **firmware_p, const char *name, @@ -862,6 +842,17 @@ _request_firmware(const struct firmware **firmware_p, const char *name, goto out; } + + /* + * Reject firmware file names with ".." path components. + * There are drivers that construct firmware file names from + * device-supplied strings, and we don't want some device to be + * able to tell us "I would like to be sent my firmware from + * ../../../etc/shadow, please". + * + * This intentionally only looks at the firmware name, not at + * the firmware base directory or at symlink contents. + */ if (name_contains_dotdot(name)) { dev_warn(device, "Firmware load for '%s' refused, path contains '..' component\n", diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..18fdbd184eea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3264,6 +3264,22 @@ static inline bool is_dot_dotdot(const char *name, size_t len) (len == 1 || (len == 2 && name[1] == '.')); } +/** + * name_contains_dotdot - check if a file name contains ".." path components + * + * Search for ".." surrounded by either '/' or start/end of string. + */ +static inline bool name_contains_dotdot(const char *name) +{ + size_t name_len; + + name_len = strlen(name); + return strcmp(name, "..") == 0 || + strncmp(name, "../", 3) == 0 || + strstr(name, "/../") != NULL || + (name_len >= 3 && strcmp(name + name_len - 3, "/..") == 0); +} + #include /* needed for stackable file system support */ From edfe3bdbbb52339cd8c2366402f2702c5ebc15c7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:20 +0200 Subject: [PATCH 12/31] coredump: don't allow ".." in coredump socket path There's no point in allowing to walk upwards for the coredump socket. We already force userspace to give use a sane path, no symlinks, no magiclinks, and also block "..". Use an absolute path without any shenanigans. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-6-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/coredump.c b/fs/coredump.c index a64b87878ab3..8437bdc26d08 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1418,6 +1418,10 @@ static inline bool check_coredump_socket(void) if (strlen(p) >= UNIX_PATH_MAX) return false; + /* Must not contain ".." in the path. */ + if (name_contains_dotdot(core_pattern)) + return false; + return true; } From 6dfc06d328b70af22c577bb908c97f8841b9f4fc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:21 +0200 Subject: [PATCH 13/31] coredump: validate socket path in coredump_parse() properly again. Someone might have modified the buffer concurrently. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-7-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/coredump.c b/fs/coredump.c index 8437bdc26d08..52efd1b34261 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -296,6 +296,17 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, return false; } + /* Must not contain ".." in the path. */ + if (name_contains_dotdot(cn->corename)) { + coredump_report_failure("Coredump socket may not %s contain '..' spaces", cn->corename); + return false; + } + + if (strlen(cn->corename) >= UNIX_PATH_MAX) { + coredump_report_failure("Coredump socket path %s too long", cn->corename); + return false; + } + /* * Currently no need to parse any other options. * Relevant information can be retrieved from the peer From 8a25350fa430a28d0595a6d14af661d4f151b123 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:22 +0200 Subject: [PATCH 14/31] selftests/coredump: make sure invalid paths are rejected Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-8-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- .../selftests/coredump/stackdump_test.c | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 9a789156f27e..a4ac80bb1003 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -241,16 +241,19 @@ out: static bool set_core_pattern(const char *pattern) { - FILE *file; - int ret; + int fd; + ssize_t ret; - file = fopen("/proc/sys/kernel/core_pattern", "w"); - if (!file) + fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); + if (fd < 0) return false; - ret = fprintf(file, "%s", pattern); - fclose(file); + ret = write(fd, pattern, strlen(pattern)); + close(fd); + if (ret < 0) + return false; + fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); return ret == strlen(pattern); } @@ -1804,4 +1807,21 @@ out: wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } +TEST_F(coredump, socket_invalid_paths) +{ + ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@..")); + + ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@@..")); + + ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); +} + TEST_HARNESS_MAIN From 70e3ee31282d293c794fb5bbec8efe495c32044b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:23 +0200 Subject: [PATCH 15/31] coredump: rename do_coredump() to vfs_coredump() Align the naming with the rest of our helpers exposed outside of core vfs. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-9-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- Documentation/security/credentials.rst | 2 +- Documentation/translations/zh_CN/security/credentials.rst | 2 +- fs/coredump.c | 2 +- include/linux/coredump.h | 4 ++-- kernel/signal.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst index 2aa0791bcefe..d0191c8b8060 100644 --- a/Documentation/security/credentials.rst +++ b/Documentation/security/credentials.rst @@ -555,5 +555,5 @@ the VFS, and that can be done by calling into such as ``vfs_mkdir()`` with a different set of credentials. This is done in the following places: * ``sys_faccessat()``. - * ``do_coredump()``. + * ``vfs_coredump()``. * nfs4recover.c. diff --git a/Documentation/translations/zh_CN/security/credentials.rst b/Documentation/translations/zh_CN/security/credentials.rst index 91c353dfb622..88fcd9152ffe 100644 --- a/Documentation/translations/zh_CN/security/credentials.rst +++ b/Documentation/translations/zh_CN/security/credentials.rst @@ -475,5 +475,5 @@ const指针上操作,因此不需要进行类型转换,但需要临时放弃 如 ``vfs_mkdir()`` 来实现。以下是一些进行此操作的位置: * ``sys_faccessat()``. - * ``do_coredump()``. + * ``vfs_coredump()``. * nfs4recover.c. diff --git a/fs/coredump.c b/fs/coredump.c index 52efd1b34261..8a401eeee940 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -865,7 +865,7 @@ static inline void coredump_sock_wait(struct file *file) { } static inline void coredump_sock_shutdown(struct file *file) { } #endif -void do_coredump(const kernel_siginfo_t *siginfo) +void vfs_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 76e41805b92d..96e8a66da133 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -43,7 +43,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -extern void do_coredump(const kernel_siginfo_t *siginfo); +extern void vfs_coredump(const kernel_siginfo_t *siginfo); /* * Logging for the coredump code, ratelimited. @@ -63,7 +63,7 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__) #else -static inline void do_coredump(const kernel_siginfo_t *siginfo) {} +static inline void vfs_coredump(const kernel_siginfo_t *siginfo) {} #define coredump_report(...) #define coredump_report_failure(...) diff --git a/kernel/signal.c b/kernel/signal.c index 148082db9a55..e2c928de7d2c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3016,7 +3016,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(&ksig->info); + vfs_coredump(&ksig->info); } /* From 7bbb05dbea38e56d9f6ac7ac1040f93b0808cbce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:24 +0200 Subject: [PATCH 16/31] coredump: split file coredumping into coredump_file() * Move that whole mess into a separate helper instead of having all that hanging around in vfs_coredump() directly. * Stop using that need_suid_safe variable and add an inline helper that clearly communicates what's going on everywhere consistently. The mm flag snapshot is stable and can't change so nothing's gained with that boolean. * Only setup cprm->file once everything else succeeded, using RAII for the coredump file before. That allows to don't care to what goto label we jump in vfs_coredump(). Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-10-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 207 ++++++++++++++++++++++++++------------------------ 1 file changed, 106 insertions(+), 101 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 8a401eeee940..9f9d8ae29359 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -865,6 +865,108 @@ static inline void coredump_sock_wait(struct file *file) { } static inline void coredump_sock_shutdown(struct file *file) { } #endif +/* cprm->mm_flags contains a stable snapshot of dumpability flags. */ +static inline bool coredump_force_suid_safe(const struct coredump_params *cprm) +{ + /* Require nonrelative corefile path and be extra careful. */ + return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT; +} + +static bool coredump_file(struct core_name *cn, struct coredump_params *cprm, + const struct linux_binfmt *binfmt) +{ + struct mnt_idmap *idmap; + struct inode *inode; + struct file *file __free(fput) = NULL; + int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; + + if (cprm->limit < binfmt->min_coredump) + return false; + + if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') { + coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump"); + return false; + } + + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!coredump_force_suid_safe(cprm)) { + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + do_unlinkat(AT_FDCWD, getname_kernel(cn->corename)); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ + if (coredump_force_suid_safe(cprm)) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + file = file_open_root(&root, cn->corename, open_flags, 0600); + path_put(&root); + } else { + file = filp_open(cn->corename, open_flags, 0600); + } + if (IS_ERR(file)) + return false; + + inode = file_inode(file); + if (inode->i_nlink > 1) + return false; + if (d_unhashed(file->f_path.dentry)) + return false; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + return false; + /* + * Don't dump core if the filesystem changed owner or mode + * of the file during file creation. This is an issue when + * a process dumps core while its cwd is e.g. on a vfat + * filesystem. + */ + idmap = file_mnt_idmap(file); + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { + coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename); + return false; + } + if ((inode->i_mode & 0677) != 0600) { + coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename); + return false; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) + return false; + if (do_truncate(idmap, file->f_path.dentry, 0, 0, file)) + return false; + + cprm->file = no_free_ptr(file); + return true; +} + void vfs_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -876,8 +978,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) int retval = 0; size_t *argv = NULL; int argc = 0; - /* require nonrelative corefile path and be extra careful */ - bool need_suid_safe = false; bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { @@ -910,11 +1010,8 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * so we dump it as root in mode 2, and only into a controlled * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { - /* Setuid core dump mode */ - cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_suid_safe = true; - } + if (coredump_force_suid_safe(&cprm)) + cred->fsuid = GLOBAL_ROOT_UID; retval = coredump_wait(siginfo->si_signo, &core_state); if (retval < 0) @@ -928,102 +1025,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) } switch (cn.core_type) { - case COREDUMP_FILE: { - struct mnt_idmap *idmap; - struct inode *inode; - int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | - O_LARGEFILE | O_EXCL; - - if (cprm.limit < binfmt->min_coredump) - goto fail_unlock; - - if (need_suid_safe && cn.corename[0] != '/') { - coredump_report_failure( - "this process can only dump core to a fully qualified path, skipping core dump"); - goto fail_unlock; - } - - /* - * Unlink the file if it exists unless this is a SUID - * binary - in that case, we're running around with root - * privs and don't want to unlink another user's coredump. - */ - if (!need_suid_safe) { - /* - * If it doesn't exist, that's fine. If there's some - * other problem, we'll catch it at the filp_open(). - */ - do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); - } - - /* - * There is a race between unlinking and creating the - * file, but if that causes an EEXIST here, that's - * fine - another process raced with us while creating - * the corefile, and the other process won. To userspace, - * what matters is that at least one of the two processes - * writes its coredump successfully, not which one. - */ - if (need_suid_safe) { - /* - * Using user namespaces, normal user tasks can change - * their current->fs->root to point to arbitrary - * directories. Since the intention of the "only dump - * with a fully qualified path" rule is to control where - * coredumps may be placed using root privileges, - * current->fs->root must not be used. Instead, use the - * root directory of init_task. - */ - struct path root; - - task_lock(&init_task); - get_fs_root(init_task.fs, &root); - task_unlock(&init_task); - cprm.file = file_open_root(&root, cn.corename, - open_flags, 0600); - path_put(&root); - } else { - cprm.file = filp_open(cn.corename, open_flags, 0600); - } - if (IS_ERR(cprm.file)) - goto fail_unlock; - - inode = file_inode(cprm.file); - if (inode->i_nlink > 1) - goto close_fail; - if (d_unhashed(cprm.file->f_path.dentry)) - goto close_fail; - /* - * AK: actually i see no reason to not allow this for named - * pipes etc, but keep the previous behaviour for now. - */ - if (!S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Don't dump core if the filesystem changed owner or mode - * of the file during file creation. This is an issue when - * a process dumps core while its cwd is e.g. on a vfat - * filesystem. - */ - idmap = file_mnt_idmap(cprm.file); - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), - current_fsuid())) { - coredump_report_failure("Core dump to %s aborted: " - "cannot preserve file owner", cn.corename); - goto close_fail; - } - if ((inode->i_mode & 0677) != 0600) { - coredump_report_failure("Core dump to %s aborted: " - "cannot preserve file permissions", cn.corename); - goto close_fail; - } - if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) - goto close_fail; - if (do_truncate(idmap, cprm.file->f_path.dentry, - 0, 0, cprm.file)) + case COREDUMP_FILE: + if (!coredump_file(&cn, &cprm, binfmt)) goto close_fail; break; - } case COREDUMP_PIPE: { int argi; int dump_count; From a961c737cda8f172e108da881691cadafb9a061e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:25 +0200 Subject: [PATCH 17/31] coredump: prepare to simplify exit paths The exit path is currently entangled with core pipe limit accounting which is really unpleasant. Use a local variable in struct core_name that remembers whether the count was incremented and if so to clean decrement in once the coredump is done. Assert that this only happens for pipes. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-11-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9f9d8ae29359..4afaf792a12e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -93,6 +93,7 @@ enum coredump_type_t { struct core_name { char *corename; int used, size; + unsigned int core_pipe_limit; enum coredump_type_t core_type; u64 mask; }; @@ -244,6 +245,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, cn->mask |= COREDUMP_WAIT; cn->used = 0; cn->corename = NULL; + cn->core_pipe_limit = 0; if (*pat_ptr == '|') cn->core_type = COREDUMP_PIPE; else if (*pat_ptr == '@') @@ -1031,7 +1033,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) break; case COREDUMP_PIPE: { int argi; - int dump_count; char **helper_argv; struct subprocess_info *sub_info; @@ -1052,21 +1053,21 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * core_pattern process dies. */ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); - goto fail_unlock; + goto close_fail; } cprm.limit = RLIM_INFINITY; - dump_count = atomic_inc_return(&core_dump_count); - if (core_pipe_limit && (core_pipe_limit < dump_count)) { + cn.core_pipe_limit = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < cn.core_pipe_limit)) { coredump_report_failure("over core_pipe_limit, skipping core dump"); - goto fail_dropcount; + goto close_fail; } helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL); if (!helper_argv) { coredump_report_failure("%s failed to allocate memory", __func__); - goto fail_dropcount; + goto close_fail; } for (argi = 0; argi < argc; argi++) helper_argv[argi] = cn.corename + argv[argi]; @@ -1168,9 +1169,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) close_fail: if (cprm.file) filp_close(cprm.file, NULL); -fail_dropcount: - if (cn.core_type == COREDUMP_PIPE) + if (cn.core_pipe_limit) { + VFS_WARN_ON_ONCE(cn.core_type != COREDUMP_PIPE); atomic_dec(&core_dump_count); + } fail_unlock: kfree(argv); kfree(cn.corename); From 4f599219f71399ac2092d2e06b2cc38e50c45c53 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:26 +0200 Subject: [PATCH 18/31] coredump: move core_pipe_count to global variable The pipe coredump counter is a static local variable instead of a global variable like all of the rest. Move it to a global variable so it's all consistent. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-12-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 4afaf792a12e..c863e053b1f8 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -82,6 +82,7 @@ static unsigned int core_sort_vma; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; +static atomic_t core_pipe_count = ATOMIC_INIT(0); enum coredump_type_t { COREDUMP_FILE = 1, @@ -981,7 +982,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) size_t *argv = NULL; int argc = 0; bool core_dumped = false; - static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, .limit = rlimit(RLIMIT_CORE), @@ -1057,7 +1057,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) } cprm.limit = RLIM_INFINITY; - cn.core_pipe_limit = atomic_inc_return(&core_dump_count); + cn.core_pipe_limit = atomic_inc_return(&core_pipe_count); if (core_pipe_limit && (core_pipe_limit < cn.core_pipe_limit)) { coredump_report_failure("over core_pipe_limit, skipping core dump"); goto close_fail; @@ -1171,7 +1171,7 @@ close_fail: filp_close(cprm.file, NULL); if (cn.core_pipe_limit) { VFS_WARN_ON_ONCE(cn.core_type != COREDUMP_PIPE); - atomic_dec(&core_dump_count); + atomic_dec(&core_pipe_count); } fail_unlock: kfree(argv); From 9f29a347d7b1b2022dfc6e5a93d4f2a7b34f5d4d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:27 +0200 Subject: [PATCH 19/31] coredump: split pipe coredumping into coredump_pipe() * Move that whole mess into a separate helper instead of having all that hanging around in vfs_coredump() directly. Cleanup paths are already centralized. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-13-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 114 ++++++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 55 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index c863e053b1f8..f4f7f0a0ae40 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -970,6 +970,63 @@ static bool coredump_file(struct core_name *cn, struct coredump_params *cprm, return true; } +static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, + size_t *argv, int argc) +{ + int argi; + char **helper_argv __free(kfree) = NULL; + struct subprocess_info *sub_info; + + if (cprm->limit == 1) { + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. + * + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a special value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); + return false; + } + cprm->limit = RLIM_INFINITY; + + cn->core_pipe_limit = atomic_inc_return(&core_pipe_count); + if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) { + coredump_report_failure("over core_pipe_limit, skipping core dump"); + return false; + } + + helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL); + if (!helper_argv) { + coredump_report_failure("%s failed to allocate memory", __func__); + return false; + } + for (argi = 0; argi < argc; argi++) + helper_argv[argi] = cn->corename + argv[argi]; + helper_argv[argi] = NULL; + + sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, + GFP_KERNEL, umh_coredump_setup, + NULL, cprm); + if (!sub_info) + return false; + + if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) { + coredump_report_failure("|%s pipe failed", cn->corename); + return false; + } + + return true; +} + void vfs_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -1031,63 +1088,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) if (!coredump_file(&cn, &cprm, binfmt)) goto close_fail; break; - case COREDUMP_PIPE: { - int argi; - char **helper_argv; - struct subprocess_info *sub_info; - - if (cprm.limit == 1) { - /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. - * - * Normally core limits are irrelevant to pipes, since - * we're not writing to the file system, but we use - * cprm.limit of 1 here as a special value, this is a - * consistent way to catch recursive crashes. - * We can still crash if the core_pattern binary sets - * RLIM_CORE = !1, but it runs as root, and can do - * lots of stupid things. - * - * Note that we use task_tgid_vnr here to grab the pid - * of the process group leader. That way we get the - * right pid if a thread in a multi-threaded - * core_pattern process dies. - */ - coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); + case COREDUMP_PIPE: + if (!coredump_pipe(&cn, &cprm, argv, argc)) goto close_fail; - } - cprm.limit = RLIM_INFINITY; - - cn.core_pipe_limit = atomic_inc_return(&core_pipe_count); - if (core_pipe_limit && (core_pipe_limit < cn.core_pipe_limit)) { - coredump_report_failure("over core_pipe_limit, skipping core dump"); - goto close_fail; - } - - helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), - GFP_KERNEL); - if (!helper_argv) { - coredump_report_failure("%s failed to allocate memory", __func__); - goto close_fail; - } - for (argi = 0; argi < argc; argi++) - helper_argv[argi] = cn.corename + argv[argi]; - helper_argv[argi] = NULL; - - retval = -ENOMEM; - sub_info = call_usermodehelper_setup(helper_argv[0], - helper_argv, NULL, GFP_KERNEL, - umh_coredump_setup, NULL, &cprm); - if (sub_info) - retval = call_usermodehelper_exec(sub_info, - UMH_WAIT_EXEC); - - kfree(helper_argv); - if (retval) { - coredump_report_failure("|%s pipe failed", cn.corename); - goto close_fail; - } break; - } case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: From 5153053692987035a82bb4a6714ea12a5bd2bfdc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:28 +0200 Subject: [PATCH 20/31] coredump: move pipe specific file check into coredump_pipe() There's no point in having this eyesore in the middle of vfs_coredump(). Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-14-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index f4f7f0a0ae40..1e05d831cda8 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1024,6 +1024,15 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, return false; } + /* + * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would + * have this set to NULL. + */ + if (!cprm->file) { + coredump_report_failure("Core dump to |%s disabled", cn->corename); + return false; + } + return true; } @@ -1117,14 +1126,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) goto close_fail; if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) { - /* - * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would - * have this set to NULL. - */ - if (!cprm.file) { - coredump_report_failure("Core dump to |%s disabled", cn.corename); - goto close_fail; - } if (!dump_vma_snapshot(&cprm)) goto close_fail; From d6527db34d08d7d411c377a25c05fff30eeba9ea Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:29 +0200 Subject: [PATCH 21/31] coredump: use a single helper for the socket Don't split it into multiple functions. Just use a single one like we do for coredump_file() and coredump_pipe() now. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-15-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 1e05d831cda8..48d90ec8c86a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -855,17 +855,18 @@ static bool coredump_sock_request(struct core_name *cn, struct coredump_params * cn->mask = ack.mask; return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK); } -#else -static bool coredump_sock_connect(struct core_name *cn, - struct coredump_params *cprm) + +static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { - coredump_report_failure("Core dump socket support %s disabled", cn->corename); - return false; + if (!coredump_sock_connect(cn, cprm)) + return false; + + return coredump_sock_request(cn, cprm); } -static bool coredump_sock_request(struct core_name *cn, - struct coredump_params *cprm) { return false; } +#else static inline void coredump_sock_wait(struct file *file) { } static inline void coredump_sock_shutdown(struct file *file) { } +static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; } #endif /* cprm->mm_flags contains a stable snapshot of dumpability flags. */ @@ -1104,10 +1105,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: - if (!coredump_sock_connect(&cn, &cprm)) - goto close_fail; - - if (!coredump_sock_request(&cn, &cprm)) + if (!coredump_socket(&cn, &cprm)) goto close_fail; break; default: From 3a4db72d0368c5f618e18a12580d5c5dca7b2839 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:30 +0200 Subject: [PATCH 22/31] coredump: add coredump_write() to encapsulate that logic simplifying vfs_coredump() even further. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-16-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 56 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 48d90ec8c86a..dea4823b55b8 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -95,6 +95,7 @@ struct core_name { char *corename; int used, size; unsigned int core_pipe_limit; + bool core_dumped; enum coredump_type_t core_type; u64 mask; }; @@ -247,6 +248,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, cn->used = 0; cn->corename = NULL; cn->core_pipe_limit = 0; + cn->core_dumped = false; if (*pat_ptr == '|') cn->core_type = COREDUMP_PIPE; else if (*pat_ptr == '@') @@ -1037,6 +1039,34 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, return true; } +static bool coredump_write(struct core_name *cn, + struct coredump_params *cprm, + struct linux_binfmt *binfmt) +{ + + if (dump_interrupted()) + return true; + + if (!dump_vma_snapshot(cprm)) + return false; + + file_start_write(cprm->file); + cn->core_dumped = binfmt->core_dump(cprm); + /* + * Ensures that file size is big enough to contain the current + * file postion. This prevents gdb from complaining about + * a truncated file if the last "write" to the file was + * dump_skip. + */ + if (cprm->to_skip) { + cprm->to_skip--; + dump_emit(cprm, "", 1); + } + file_end_write(cprm->file); + free_vma_snapshot(cprm); + return true; +} + void vfs_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -1048,7 +1078,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) int retval = 0; size_t *argv = NULL; int argc = 0; - bool core_dumped = false; struct coredump_params cprm = { .siginfo = siginfo, .limit = rlimit(RLIMIT_CORE), @@ -1123,31 +1152,14 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) if (retval) goto close_fail; - if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) { - if (!dump_vma_snapshot(&cprm)) - goto close_fail; - - file_start_write(cprm.file); - core_dumped = binfmt->core_dump(&cprm); - /* - * Ensures that file size is big enough to contain the current - * file postion. This prevents gdb from complaining about - * a truncated file if the last "write" to the file was - * dump_skip. - */ - if (cprm.to_skip) { - cprm.to_skip--; - dump_emit(&cprm, "", 1); - } - file_end_write(cprm.file); - free_vma_snapshot(&cprm); - } + if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt)) + goto close_fail; coredump_sock_shutdown(cprm.file); /* Let the parent know that a coredump was generated. */ if (cn.mask & COREDUMP_USERSPACE) - core_dumped = true; + cn.core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server @@ -1179,7 +1191,7 @@ close_fail: fail_unlock: kfree(argv); kfree(cn.corename); - coredump_finish(core_dumped); + coredump_finish(cn.core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); From 4a9f5d7fb6649af534c36aa8cc9c1aa51b172ad9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:31 +0200 Subject: [PATCH 23/31] coredump: auto cleanup argv to prepare for a simpler exit path. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-17-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index dea4823b55b8..68da77d00170 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1076,7 +1076,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - size_t *argv = NULL; + size_t *argv __free(kfree) = NULL; int argc = 0; struct coredump_params cprm = { .siginfo = siginfo, @@ -1189,7 +1189,6 @@ close_fail: atomic_dec(&core_pipe_count); } fail_unlock: - kfree(argv); kfree(cn.corename); coredump_finish(cn.core_dumped); revert_creds(old_cred); From 8434fac512d35597488b13e23cc85e2903f5c8ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:32 +0200 Subject: [PATCH 24/31] coredump: directly return instead of jumping to a pointless cleanup label. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-18-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 68da77d00170..b2e9ac34d9a3 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1095,13 +1095,13 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) - goto fail; + return; if (!__get_dumpable(cprm.mm_flags)) - goto fail; + return; cred = prepare_creds(); if (!cred) - goto fail; + return; /* * We cannot trust fsuid as being the "true" uid of the process * nor do we know its entire history. We only know it was tainted @@ -1194,7 +1194,6 @@ fail_unlock: revert_creds(old_cred); fail_creds: put_cred(cred); -fail: return; } From 377d7860c960ac8e672881bc50353d867e2f94a4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:33 +0200 Subject: [PATCH 25/31] cred: add auto cleanup method Add a simple auto cleanup method for struct cred. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-19-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- include/linux/cred.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cred.h b/include/linux/cred.h index 5658a3bfe803..a102a10f833f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -263,6 +263,8 @@ static inline void put_cred(const struct cred *cred) put_cred_many(cred, 1); } +DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) + /** * current_cred - Access the current task's subjective credentials * From 7a568fcdad7c75a1ee196921cf651de607c2c5d5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:34 +0200 Subject: [PATCH 26/31] coredump: auto cleanup prepare_creds() which will allow us to simplify the exit path in further patches. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-20-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index b2e9ac34d9a3..e9c8696283f6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1074,7 +1074,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; const struct cred *old_cred; - struct cred *cred; + struct cred *cred __free(put_cred) = NULL; int retval = 0; size_t *argv __free(kfree) = NULL; int argc = 0; @@ -1113,7 +1113,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) retval = coredump_wait(siginfo->si_signo, &core_state); if (retval < 0) - goto fail_creds; + return; old_cred = override_creds(cred); @@ -1192,8 +1192,6 @@ fail_unlock: kfree(cn.corename); coredump_finish(cn.core_dumped); revert_creds(old_cred); -fail_creds: - put_cred(cred); return; } From cfd6c12293d7cc257f27770399a3d8f11bf7d25c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:35 +0200 Subject: [PATCH 27/31] coredump: add coredump_cleanup() Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-21-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index e9c8696283f6..9c028ef087d4 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1067,6 +1067,18 @@ static bool coredump_write(struct core_name *cn, return true; } +static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm) +{ + if (cprm->file) + filp_close(cprm->file, NULL); + if (cn->core_pipe_limit) { + VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE); + atomic_dec(&core_pipe_count); + } + kfree(cn->corename); + coredump_finish(cn->core_dumped); +} + void vfs_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -1119,7 +1131,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) if (!coredump_parse(&cn, &cprm, &argv, &argc)) { coredump_report_failure("format_corename failed, aborting core"); - goto fail_unlock; + goto close_fail; } switch (cn.core_type) { @@ -1182,15 +1194,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) } close_fail: - if (cprm.file) - filp_close(cprm.file, NULL); - if (cn.core_pipe_limit) { - VFS_WARN_ON_ONCE(cn.core_type != COREDUMP_PIPE); - atomic_dec(&core_pipe_count); - } -fail_unlock: - kfree(cn.corename); - coredump_finish(cn.core_dumped); + coredump_cleanup(&cn, &cprm); revert_creds(old_cred); return; } From ae20958b37acf82da4928910ca6351719b5ddba7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:36 +0200 Subject: [PATCH 28/31] coredump: order auto cleanup variables at the top so they're easy to spot. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-22-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9c028ef087d4..d3f09bf71f5f 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1081,14 +1081,14 @@ static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm) void vfs_coredump(const kernel_siginfo_t *siginfo) { + struct cred *cred __free(put_cred) = NULL; + size_t *argv __free(kfree) = NULL; struct core_state core_state; struct core_name cn; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; const struct cred *old_cred; - struct cred *cred __free(put_cred) = NULL; int retval = 0; - size_t *argv __free(kfree) = NULL; int argc = 0; struct coredump_params cprm = { .siginfo = siginfo, From 9dd88f3626462e4ffd008196e053d4004e44427b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:37 +0200 Subject: [PATCH 29/31] coredump: avoid pointless variable we don't use that value at all so don't bother with it in the first place. Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-23-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index d3f09bf71f5f..178eddbcd6ad 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1088,7 +1088,6 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; const struct cred *old_cred; - int retval = 0; int argc = 0; struct coredump_params cprm = { .siginfo = siginfo, @@ -1123,8 +1122,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) if (coredump_force_suid_safe(&cprm)) cred->fsuid = GLOBAL_ROOT_UID; - retval = coredump_wait(siginfo->si_signo, &core_state); - if (retval < 0) + if (coredump_wait(siginfo->si_signo, &core_state) < 0) return; old_cred = override_creds(cred); @@ -1160,8 +1158,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ - retval = unshare_files(); - if (retval) + if (unshare_files()) goto close_fail; if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt)) From da9029b47d790675dcd82a6d9e332bc41e1a17f1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 12 Jun 2025 15:25:38 +0200 Subject: [PATCH 30/31] coredump: add coredump_skip() helper Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-24-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 178eddbcd6ad..fadf9d4be2e1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1079,6 +1079,18 @@ static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm) coredump_finish(cn->core_dumped); } +static inline bool coredump_skip(const struct coredump_params *cprm, + const struct linux_binfmt *binfmt) +{ + if (!binfmt) + return true; + if (!binfmt->core_dump) + return true; + if (!__get_dumpable(cprm->mm_flags)) + return true; + return false; +} + void vfs_coredump(const kernel_siginfo_t *siginfo) { struct cred *cred __free(put_cred) = NULL; @@ -1086,7 +1098,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) struct core_state core_state; struct core_name cn; struct mm_struct *mm = current->mm; - struct linux_binfmt * binfmt; + struct linux_binfmt *binfmt = mm->binfmt; const struct cred *old_cred; int argc = 0; struct coredump_params cprm = { @@ -1104,10 +1116,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) audit_core_dumps(siginfo->si_signo); - binfmt = mm->binfmt; - if (!binfmt || !binfmt->core_dump) - return; - if (!__get_dumpable(cprm.mm_flags)) + if (coredump_skip(&cprm, binfmt)) return; cred = prepare_creds(); From 5c21c5f22d0701ac6c1cafc0e8de4bf42e5c53e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 11 Jul 2025 15:47:48 +0200 Subject: [PATCH 31/31] cleanup: add a scoped version of CLASS() This will make it possible to use: scoped_class() { } constructs to limit variables to certain scopes and still perform auto-cleanup. Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 7093e1d08af0..bee606bebaca 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -277,6 +277,14 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor +#define scoped_class(_name, var, args) \ + for (CLASS(_name, var)(args); \ + __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ + ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ + } else /* * DEFINE_GUARD(name, type, lock, unlock):