twx-linux/include/linux/rseq_entry.h
Thomas Gleixner 05b44aef70 rseq: Implement fast path for exit to user
Implement the actual logic for handling RSEQ updates in a fast path after
handling the TIF work and at the point where the task is actually returning
to user space.

This is the right point to do that because at this point the CPU and the MM
CID are stable and can no longer change due to yet another reschedule. Such
a reschedule can still happen when the update is handled via
TIF_NOTIFY_RESUME in resume_user_mode_work(), which is invoked from the
exit to user mode work loop.

The function is invoked after the TIF work is handled and runs with
interrupts disabled, which means it cannot resolve page faults. It
therefore disables page faults, and if the access to the user space memory
faults, it:

  - notes the fail in the event struct
  - raises TIF_NOTIFY_RESUME
  - returns false to the caller

The caller has to go back to the TIF work, which runs with interrupts
enabled and therefore can resolve the page faults. This happens mostly on
fork() when the memory is marked COW.

If the user memory inspection finds invalid data, the function returns
false as well and sets the fatal flag in the event struct along with
TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag
and terminate the task with SIGSEGV as documented.
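
For illustration, a minimal sketch of that slow path reaction; the handler
name is the existing __rseq_handle_notify_resume(), but the exact placement
of the check inside it is an assumption here:

      /* Sketch: in the TIF_NOTIFY_RESUME slow path, e.g. __rseq_handle_notify_resume() */
      if (unlikely(current->rseq.event.fatal)) {
              force_sig(SIGSEGV);
              return;
      }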

The initial decision to invoke any of this is based on a single flag in the
event struct: @sched_switch. In pseudo ASM the decision is:

      load	tsk::event::sched_switch
      jnz	inspect_user_space
      mov	$0, tsk::event::events
      ...
      leave

So for the common case where the task was not scheduled out, this really
boils down to three instructions before going out if the compiler is not
completely stupid (and yes, some of them are).

If the condition is true, then it checks whether the CPU ID or the MM CID
have changed. If so, the CPU/MM IDs have to be updated and are thereby
cached for the next round. The update unconditionally retrieves the user
space critical section address to spare another user*begin/end() pair. If
that address is not zero and tsk::event::user_irq is set, then the critical
section is analyzed and acted upon. If it is zero or the entry came via
syscall, the critical section analysis is skipped.

If the comparison is false, the critical section still has to be checked,
because in that case the event flag can only be true when the entry from
user space happened via an interrupt.
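
Condensed into C-like pseudo code (mirroring rseq_exit_user_update()
further down), the decision tree is roughly:

      if (ids_changed) {
              write cpu_id, node_id, mm_cid; read rseq_cs;
              if (rseq_cs && (user_irq || debug))
                      analyze critical section;
      } else {
              read rseq_cs;
              if (rseq_cs || debug)
                      analyze critical section;
      }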

This is provided without the actual hookup to let reviewers focus on the
implementation details. The hookup happens in the next step.
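
To make the control flow concrete, here is a rough, purely illustrative
sketch of what the hookup in the generic exit path could look like; the
loop shape and the call sites shown are assumptions, not part of this
patch:

      /* Hypothetical caller, interrupts disabled at this point */
      for (;;) {
              unsigned long ti_work = read_thread_flags();

              if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                      ti_work = exit_to_user_mode_loop(regs, ti_work);

              /* Fast path; on failure TIF_NOTIFY_RESUME is set, go around */
              if (likely(!rseq_exit_to_user_mode_restart(regs)))
                      break;
      }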

Note: As with quite a few other optimizations, this depends on the generic
entry infrastructure and is not enabled to be sucked into random
architecture implementations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de
2025-11-04 08:34:18 +01:00

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;		/* Exits to user space */
	unsigned long	signal;		/* Updates from the signal delivery path */
	unsigned long	slowpath;	/* Updates from the TIF_NOTIFY_RESUME slow path */
	unsigned long	fastpath;	/* Updates from the exit to user fast path */
	unsigned long	ids;		/* CPU ID / MM CID updates written to user space */
	unsigned long	cs;		/* Critical section addresses inspected */
	unsigned long	clear;		/* Critical section pointers cleared */
	unsigned long	fixup;		/* Instruction pointer fixups to the abort IP */
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);

void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It's clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}

		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels validate that user space did not mess with it if the
 * debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}
#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write, creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address when @csaddr != NULL. This allows putting the ID update and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It's clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
	}

	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;

	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows to skip the critical section when the entry was not from
	 * a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}

	if (likely(!csaddr))
		return true;

	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 *  1) It's a one time comparison in the fast path when there is no event to
 *     handle
 *
 *  2) The access to the user space rseq memory (TLS) is unlikely to fault
 *     so the straight inline operation is:
 *
 *	- Four 32-bit stores, only if CPU ID / MM CID need to be updated
 *	- One 64-bit load to retrieve the critical section address
 *
 *  3) In the unlikely case that the critical section address is != NULL:
 *
 *	- One 64-bit load to retrieve the start IP
 *	- One 64-bit load to retrieve the offset for calculating the end
 *	- One 64-bit load to retrieve the abort IP
 *	- One 32-bit load to retrieve the signature
 *	- One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled
	 */
	guard(pagefault)();

	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed rseq_event::user_irq must be true
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
			return false;

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id		= task_cpu(t),
		.mm_cid		= task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor had the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing to do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely(t->rseq.event.sched_switch)) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
		current->rseq.event.slowpath = true;
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return true;
	}
	return false;
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; }
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}

#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	return false;
}
static inline void rseq_exit_to_user_mode(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */