From ee4a2e08c10188f02fe3fb36b6beddc3c2fdb287 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Jul 2025 19:27:41 +1000 Subject: [PATCH 1/2] entry: Add arch_in_rcu_eqs() All architectures have an interruptible RCU extended quiescent state (EQS) as part of their idle sequences, where interrupts can occur without RCU watching. Entry code must account for this and wake RCU as necessary; the common entry code deals with this in irqentry_enter() by treating any interrupt from an idle thread as potentially having occurred within an EQS and waking RCU for the duration of the interrupt via rcu_irq_enter() .. rcu_irq_exit(). Some architectures may have other interruptible EQSs which require similar treatment. For example, on s390 it is necessary to enable interrupts around guest entry in the middle of a period where core KVM code has entered an EQS. So that architectures can wake RCU in these cases, this patch adds a new arch_in_rcu_eqs() hook to the common entry code which is checked in addition to the existing is_idle_thread() check, with RCU woken if either returns true. A default implementation is provided which always returns false, which suffices for most architectures. As no architectures currently implement arch_in_rcu_eqs(), there should be no functional change as a result of this patch alone. A subsequent patch will add an s390 implementation to fix a latent bug with missing RCU wakeups. [ajd@linux.ibm.com: rebase, fix commit message] Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Claudio Imbrenda Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Janosch Frank Reviewed-by: Christian Borntraeger Signed-off-by: Andrew Donnellan Acked-by: Peter Zijlstra (Intel) Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20250708092742.104309-2-ajd@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20250708092742.104309-2-ajd@linux.ibm.com> --- include/linux/entry-common.h | 16 ++++++++++++++++ kernel/entry/common.c | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index f94f3fdf15fc..3bf99cbad8a3 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -86,6 +86,22 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} #endif +/** + * arch_in_rcu_eqs - Architecture specific check for RCU extended quiescent + * states. + * + * Returns: true if the CPU is potentially in an RCU EQS, false otherwise. + * + * Architectures only need to define this if threads other than the idle thread + * may have an interruptible EQS. This does not need to handle idle threads. It + * is safe to over-estimate at the cost of redundant RCU management work. + * + * Invoked from irqentry_enter() + */ +#ifndef arch_in_rcu_eqs +static __always_inline bool arch_in_rcu_eqs(void) { return false; } +#endif + /** * enter_from_user_mode - Establish state when coming from user mode * diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a8dd1f27417c..eb52d38e8099 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -220,7 +220,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required From 57d88f02eb4449d96dfee3af4b7cd4287998bdbd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Jul 2025 19:27:42 +1000 Subject: [PATCH 2/2] KVM: s390: Rework guest entry logic In __vcpu_run() and do_vsie_run(), we enter an RCU extended quiescent state (EQS) by calling guest_enter_irqoff(), which lasts until __vcpu_run() calls guest_exit_irqoff(). However, between the two we enable interrupts and may handle interrupts during the EQS. As the IRQ entry code will not wake RCU in this case, we may run the core IRQ code and IRQ handler without RCU watching, leading to various potential problems. It is necessary to unmask (host) interrupts around entering the guest, as entering the guest via SIE will not automatically unmask these. When a host interrupt is taken from a guest, it is taken via its regular host IRQ handler rather than being treated as a direct exit from SIE. Due to this, we cannot simply mask interrupts around guest entry, and must handle interrupts during this window, waking RCU as required. Additionally, between guest_enter_irqoff() and guest_exit_irqoff(), we use local_irq_enable() and local_irq_disable() to unmask interrupts, violating the ordering requirements for RCU/lockdep/tracing around entry/exit sequences. Further, since this occurs in an instrumentable function, it's possible that instrumented code runs during this window, with potential usage of RCU, etc. To fix the RCU wakeup problem, an s390 implementation of arch_in_rcu_eqs() is added which checks for PF_VCPU in current->flags. PF_VCPU is set/cleared by guest_timing_{enter,exit}_irqoff(), which surround the actual guest entry. To fix the remaining issues, the lower-level guest entry logic is moved into a shared noinstr helper function using the guest_state_{enter,exit}_irqoff() helpers. These perform all the lockdep/RCU/tracing manipulation necessary, but as sie64a() does not enable/disable interrupts, we must do this explicitly with the non-instrumented arch_local_irq_{enable,disable}() helpers: guest_state_enter_irqoff() arch_local_irq_enable(); sie64a(...); arch_local_irq_disable(); guest_state_exit_irqoff(); [ajd@linux.ibm.com: rebase, fix commit message] Signed-off-by: Mark Rutland Cc: Christian Borntraeger Cc: Frederic Weisbecker Cc: Heiko Carstens Cc: Janosch Frank Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Claudio Imbrenda Cc: Alexander Gordeev Signed-off-by: Andrew Donnellan Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20250708092742.104309-3-ajd@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20250708092742.104309-3-ajd@linux.ibm.com> --- arch/s390/include/asm/entry-common.h | 10 ++++++ arch/s390/include/asm/kvm_host.h | 3 ++ arch/s390/kvm/kvm-s390.c | 51 +++++++++++++++++++++------- arch/s390/kvm/vsie.c | 17 ++++------ 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 35555c944630..979af986a8fe 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -59,4 +59,14 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare +static __always_inline bool arch_in_rcu_eqs(void) +{ + if (IS_ENABLED(CONFIG_KVM)) + return current->flags & PF_VCPU; + + return false; +} + +#define arch_in_rcu_eqs arch_in_rcu_eqs + #endif diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index cb89e54ada25..f870d09515cc 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -716,6 +716,9 @@ extern char sie_exit; bool kvm_s390_pv_is_protected(struct kvm *kvm); bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); +extern int kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, + u64 *gprs, unsigned long gasce); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d5ad10791c25..bfe9ba5c4f45 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5062,6 +5062,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) return vcpu_post_run_handle_fault(vcpu); } +int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, + u64 *gprs, unsigned long gasce) +{ + int ret; + + guest_state_enter_irqoff(); + + /* + * The guest_state_{enter,exit}_irqoff() functions inform lockdep and + * tracing that entry to the guest will enable host IRQs, and exit from + * the guest will disable host IRQs. + * + * We must not use lockdep/tracing/RCU in this critical section, so we + * use the low-level arch_local_irq_*() helpers to enable/disable IRQs. + */ + arch_local_irq_enable(); + ret = sie64a(scb, gprs, gasce); + arch_local_irq_disable(); + + guest_state_exit_irqoff(); + + return ret; +} + #define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { @@ -5082,20 +5106,27 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between - * guest_enter and guest_exit should be no uaccess. + * guest_timing_enter_irqoff and guest_timing_exit_irqoff + * should be no uaccess. */ - local_irq_disable(); - guest_enter_irqoff(); - __disable_cpu_timer_accounting(vcpu); - local_irq_enable(); if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(sie_page->pv_grregs, vcpu->run->s.regs.gprs, sizeof(sie_page->pv_grregs)); } - exit_reason = sie64a(vcpu->arch.sie_block, - vcpu->run->s.regs.gprs, - vcpu->arch.gmap->asce); + + local_irq_disable(); + guest_timing_enter_irqoff(); + __disable_cpu_timer_accounting(vcpu); + + exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs, + vcpu->arch.gmap->asce); + + __enable_cpu_timer_accounting(vcpu); + guest_timing_exit_irqoff(); + local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(vcpu->run->s.regs.gprs, sie_page->pv_grregs, @@ -5111,10 +5142,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; } } - local_irq_disable(); - __enable_cpu_timer_accounting(vcpu); - guest_exit_irqoff(); - local_irq_enable(); kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 13a9661d2b28..347268f89f2f 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1170,10 +1170,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->arch.sie_block->fpf & FPF_BPBC) set_thread_flag(TIF_ISOLATE_BP_GUEST); - local_irq_disable(); - guest_enter_irqoff(); - local_irq_enable(); - /* * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking * and VCPU requests also hinder the vSIE from running and lead @@ -1183,15 +1179,16 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; current->thread.gmap_int_code = 0; barrier(); - if (!kvm_s390_vcpu_sie_inhibited(vcpu)) - rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); + if (!kvm_s390_vcpu_sie_inhibited(vcpu)) { + local_irq_disable(); + guest_timing_enter_irqoff(); + rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); + guest_timing_exit_irqoff(); + local_irq_enable(); + } barrier(); vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; - local_irq_disable(); - guest_exit_irqoff(); - local_irq_enable(); - /* restore guest state for bp isolation override */ if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST);