From 1b558e14f3c17dc29ce2e8cd0b8bd385e108734b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jul 2025 16:12:19 +0200 Subject: [PATCH 01/49] x86/apic: Make the ISR clearing sane apic_pending_intr_clear() is fundamentally voodoo programming. Its primary purpose is to clear stale ISR bits in the local APIC, which would otherwise lock the corresponding interrupt priority level. The comments and the implementation claim falsely that after clearing the stale ISR bits, eventually stale IRR bits would be turned into ISR bits and can be cleared as well. That's just wishful thinking because: 1) If interrupts are disabled, the APIC does not propagate an IRR bit to the ISR. 2) If interrupts are enabled, then the APIC propagates the IRR bit to the ISR and raises the interrupt in the CPU, which means that code _cannot_ observe the ISR bit for any of those IRR bits. Rename the function to reflect the purpose and make exactly _one_ attempt to EOI the pending ISR bits and add comments why traversing the pending bit map in low to high priority order is correct. Instead of trying to "clear" IRR bits, simply print a warning message when the IRR is not empty. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/871ppwih4s.ffs@tglx --- arch/x86/kernel/apic/apic.c | 79 ++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623..ff4029b57e95 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1428,63 +1428,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); - /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. - */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); + /* If the ISR map is empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) return true; - } - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. 
+ */ + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); + + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. + * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. */ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. 
*/ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1541,8 +1539,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC From e2e29752357f32feb69a68e9e6e7361405b3f289 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:04 +0200 Subject: [PATCH 02/49] x86/sev: Separate MSR and GHCB based snp_cpuid() via a callback There are two distinct callers of snp_cpuid(): the MSR protocol and the GHCB page based interface. The snp_cpuid() logic does not care about the distinction, which only matters at a lower level. But the fact that it supports both interfaces means that the GHCB page based logic is pulled into the early startup code where PA to VA conversions are problematic, given that it runs from the 1:1 mapping of memory. So keep snp_cpuid() itself in the startup code, but factor out the hypervisor calls via a callback, so that the GHCB page handling can be moved out. Code refactoring only - no functional change intended. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-25-ardb+git@google.com --- arch/x86/boot/startup/sev-shared.c | 59 ++++++------------------------ arch/x86/coco/sev/vc-shared.c | 49 ++++++++++++++++++++++++- arch/x86/include/asm/sev.h | 3 +- 3 files changed, 61 insertions(+), 50 deletions(-) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a34cd19796f9..ed88dfe7605e 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -342,44 +342,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) return ret; } -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? 
__sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} /* * This may be called early while still running on the initial identity @@ -484,21 +447,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(ghcb, ctxt, leaf)) + if (__sev_cpuid_hv_msr(leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } static int __head -snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -517,7 +480,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -565,7 +528,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, } break; case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -586,8 +549,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. 
*/ -int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -621,7 +584,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(ghcb, ctxt, leaf); + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); } /* @@ -648,7 +611,7 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(NULL, NULL, &leaf); + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 2c0ab0fdc060..b4688f69102e 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,15 +409,62 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +struct cpuid_ctx { + struct ghcb *ghcb; + struct es_em_ctxt *ctxt; +}; + +static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf) +{ + 
struct cpuid_ctx *ctx = p; + + if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct cpuid_ctx ctx = { ghcb, ctxt }; struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); + ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb1..e4622e470ceb 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -552,7 +552,8 @@ struct cpuid_leaf { u32 edx; }; -int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, From 30c2b98aa84c76f2ae60e66dd4ec2d9497713359 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:17 +0530 Subject: [PATCH 03/49] x86/apic: Add new driver for Secure AVIC The Secure AVIC feature provides SEV-SNP guests hardware acceleration for performance sensitive APIC accesses while securely managing the guest-owned APIC state through the use of a private APIC backing page. This helps prevent the hypervisor from generating unexpected interrupts for a vCPU or otherwise violating architectural assumptions around the APIC behavior. Add a new x2APIC driver that will serve as the base of the Secure AVIC support. It is initially the same as the x2APIC physical driver (without IPI callbacks), but will be modified as features are implemented. 
As the new driver does not implement Secure AVIC features yet, if the hypervisor sets the Secure AVIC bit in SEV_STATUS, maintain the existing behavior to enforce the guest termination. [ bp: Massage commit message. ] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-2-Neeraj.Upadhyay@amd.com --- arch/x86/Kconfig | 13 ++++++ arch/x86/boot/compressed/sev.c | 1 + arch/x86/coco/core.c | 3 ++ arch/x86/coco/sev/core.c | 1 + arch/x86/include/asm/msr-index.h | 4 +- arch/x86/kernel/apic/Makefile | 1 + arch/x86/kernel/apic/x2apic_savic.c | 63 +++++++++++++++++++++++++++++ include/linux/cc_platform.h | 8 ++++ 8 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/x2apic_savic.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..e32952728d9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -483,6 +483,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N. 
+ config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..74e083feb2d9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -235,6 +235,7 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) /* diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af68114..989ca9f72ba3 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..f7a549f650e9 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -79,6 +79,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..2a6d4fd8659a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -699,7 +699,9 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK 
GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..bea844f28192 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. 
+ * + * Author: Neeraj Upadhyay + */ + +#include + +#include +#include + +#include "local.h" + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + snp_abort(); + /* unreachable */ + } + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .nmi_to_offline_cpu = true, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_savic); diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048..7fcec025c5e0 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. 
+ */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM From c15a4705d59caeb44f4c373cf04e89041309e568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:05 +0200 Subject: [PATCH 04/49] x86/sev: Use MSR protocol for remapping SVSM calling area As the preceding code comment already indicates, remapping the SVSM calling area occurs long before the GHCB page is configured, and so calling svsm_perform_call_protocol() is guaranteed to result in a call to svsm_perform_msr_protocol(). So just call the latter directly. This allows most of the GHCB based API infrastructure to be moved out of the startup code in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-26-ardb+git@google.com --- arch/x86/boot/startup/sev-shared.c | 11 +++++++++++ arch/x86/boot/startup/sev-startup.c | 5 ++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index ed88dfe7605e..975d2b02926a 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -724,6 +724,17 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } +static int __head svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 0b7e3b950183..8412807a865c 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -295,7 +295,6 @@ found_cc_info: static __head void svsm_setup(struct cc_blob_sev_info *cc_info) { struct svsm_call call = {}; - int ret; u64 pa; /* @@ -325,8 +324,8 @@ 
static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.caa = svsm_get_caa(); call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); boot_svsm_caa = (struct svsm_ca *)pa; From 7cb7b6de9cb90311a917d65c0228b6aa223dc456 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:06 +0200 Subject: [PATCH 05/49] x86/sev: Use MSR protocol only for early SVSM PVALIDATE call The early page state change API performs an SVSM call to PVALIDATE each page when running under a SVSM, and this involves either a GHCB page based call or a call based on the MSR protocol. The GHCB page based variant involves VA to PA translation of the GHCB address, and this is best avoided in the startup code, where virtual addresses are ambiguous (1:1 or kernel virtual). As this is the last remaining occurrence of svsm_perform_call_protocol() in the startup code, switch to the MSR protocol exclusively in this particular case, so that the GHCB based plumbing can be moved out of the startup code entirely in a subsequent patch. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-27-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 20 -------------------- arch/x86/boot/startup/sev-shared.c | 9 ++++++--- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..b71c1ab6a282 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -50,31 +50,11 @@ u64 svsm_get_caa_pa(void) return boot_svsm_caa_pa; } -int svsm_perform_call_protocol(struct svsm_call *call); - u8 snp_vmpl; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb *ghcb; - int ret; - - if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - return ret; -} - static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 975d2b02926a..7bd73462c11e 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -741,7 +741,6 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) struct svsm_call call = {}; unsigned long flags; u64 pc_pa; - int ret; /* * This can be called very early in the boot, use native functions in @@ -766,8 +765,12 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); call.rcx = pc_pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. 
+ */ + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); From e349241b97a8b1169a4e90375159df4d22061f9a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:07 +0200 Subject: [PATCH 06/49] x86/sev: Run RMPADJUST on SVSM calling area page to test VMPL Determining the VMPL at which the kernel runs involves performing a RMPADJUST operation on an arbitrary page of memory, and observing whether it succeeds. The use of boot_ghcb_page in the core kernel in this case is completely arbitrary, but results in the need to provide a PIC alias for it. So use boot_svsm_ca_page instead, which already needs this alias for other reasons. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-28-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 2 +- arch/x86/boot/startup/sev-shared.c | 5 +++-- arch/x86/boot/startup/sev-startup.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index b71c1ab6a282..3628e9bddc6a 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -327,7 +327,7 @@ static bool early_snp_init(struct boot_params *bp) * running at VMPL0. The CA will be used to communicate with the * SVSM and request its services. 
*/ - svsm_setup_ca(cc_info); + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); /* * Pass run-time kernel a pointer to CC info via boot_params so EFI diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 7bd73462c11e..83c222a4f1fa 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -801,7 +801,8 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) +static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) { struct snp_secrets_page *secrets_page; struct snp_cpuid_table *cpuid_table; @@ -824,7 +825,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) return false; /* diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8412807a865c..3da04a715831 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -302,7 +302,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) * running at VMPL0. The CA will be used to communicate with the * SVSM to perform the SVSM services. */ - if (!svsm_setup_ca(cc_info)) + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) return; /* From b8c3c9f5d0505905e21c03731d1665c67053b47e Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:18 +0530 Subject: [PATCH 07/49] x86/apic: Initialize Secure AVIC APIC backing page With Secure AVIC, the APIC backing page is owned and managed by the guest. 
Allocate and initialize APIC backing page for all guest CPUs. The NPT entry for a vCPU's APIC backing page must always be present when the vCPU is running in order for Secure AVIC to function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot be resumed otherwise. To handle this, notify GPA of the vCPU's APIC backing page to the hypervisor by using the SVM_VMGEXIT_SECURE_AVIC GHCB protocol event. Before executing VMRUN, the hypervisor makes use of this information to make sure the APIC backing page is mapped in the NPT. [ bp: Massage commit message. ] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-3-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 22 ++++++++++++++++++ arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/sev.h | 2 ++ arch/x86/include/uapi/asm/svm.h | 4 ++++ arch/x86/kernel/apic/apic.c | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 35 +++++++++++++++++++++++++++++ 6 files changed, 67 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index f7a549f650e9..7669aafcad95 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,28 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +enum es_result savic_register_gpa(u64 gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + ghcb_set_rbx(ghcb, gpa); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0); + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 
07ba4935e873..44b4080721a6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,6 +305,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + void (*setup)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb1..9036122a6d45 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -533,6 +533,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +enum es_result savic_register_gpa(u64 gpa); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -605,6 +606,7 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } +static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 9c640a521a67..650e3256ea7d 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ff4029b57e95..7874284c1ca7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1501,6 +1501,9 @@ static void 
setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bea844f28192..948d89497baa 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -8,17 +8,47 @@ */ #include +#include #include #include #include "local.h" +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. 
+ */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + snp_abort(); +} + static int savic_probe(void) { if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) @@ -30,6 +60,10 @@ static int savic_probe(void) /* unreachable */ } + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + snp_abort(); + return 1; } @@ -38,6 +72,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .name = "secure avic x2apic", .probe = savic_probe, .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, .dest_mode_logical = false, From c822f58a4fab25944ba66768c1d6c563aa6ac077 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:40 +0530 Subject: [PATCH 08/49] x86/apic: Populate .read()/.write() callbacks of Secure AVIC driver Add read() and write() APIC callback functions to read and write the x2APIC registers directly from the guest APIC backing page of a vCPU. The x2APIC registers are mapped at an offset within the guest APIC backing page which is the same as their x2APIC MMIO offset. Secure AVIC adds new registers such as ALLOWED_IRRs (which are at 4-byte offset within the IRR register offset range) and NMI_REQ to the APIC register space. When Secure AVIC is enabled, accessing the guest's APIC registers through RD/WRMSR results in a #VC exception (for non-accelerated register accesses) with error code VMEXIT_AVIC_NOACCEL. The #VC exception handler can read/write the x2APIC register in the guest APIC backing page to complete the RDMSR/WRMSR. Since doing this would increase the latency of accessing the x2APIC registers, instead of doing RDMSR/WRMSR based register accesses and handling reads/writes in the #VC exception, directly read/write the APIC registers from/to the guest APIC backing page of the vCPU in read() and write() callbacks of the Secure AVIC APIC driver. [ bp: Massage commit message. 
] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/apicdef.h | 2 + arch/x86/kernel/apic/x2apic_savic.c | 122 +++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 094106b6a538..be39a543fbe5 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -135,6 +135,8 @@ #define APIC_TDR_DIV_128 0xA #define APIC_EFEAT 0x400 #define APIC_ECTRL 0x410 +#define APIC_SEOI 0x420 +#define APIC_IER 0x480 #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 948d89497baa..5479605429c1 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -9,6 +9,7 @@ #include <linux/cc_platform.h> #include <linux/percpu-defs.h> +#include <linux/align.h> #include <asm/apic.h> #include <asm/sev.h> @@ -26,6 +27,123 @@ static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. + * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. 
+ */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. 
+ */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_LVT0: + case APIC_LVT1: + case APIC_TMICT: + case APIC_TDCR: + case APIC_SELF_IPI: + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + apic_set_reg64(ap, reg, (u64)data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -88,8 +206,8 @@ static struct apic apic_x2apic_savic __ro_after_init = { .nmi_to_offline_cpu = true, - .read = native_apic_msr_read, - .write = native_apic_msr_write, + .read = savic_read, + .write = savic_write, .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, From 45e2cef568cdf87cb06c9783b45c8f08d1ab1cec Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:41 +0530 Subject: [PATCH 09/49] x86/apic: Initialize APIC ID for Secure AVIC Initialize the APIC ID in the Secure AVIC APIC backing page with the APIC_ID MSR value read from the hypervisor. CPU topology evaluation later during boot would catch and report any duplicate APIC ID for two CPUs. 
Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-2-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 5479605429c1..56c51ea4e5ab 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -150,6 +150,12 @@ static void savic_setup(void) enum es_result res; unsigned long gpa; + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + gpa = __pa(ap); /* From 60791ef3751cb0ceccd6f5ac98276153745c7980 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:42 +0530 Subject: [PATCH 10/49] x86/apic: Add update_vector() callback for APIC drivers Add an update_vector() callback to allow APIC drivers to perform driver specific operations on external vector allocation/teardown on a CPU. This callback will be used by the Secure AVIC APIC driver to configure the vectors which a guest vCPU allows the hypervisor to send to it. As system vectors have fixed vector assignments and are not dynamically allocated, add an apic_update_vector() public API to facilitate update_vector() callback invocation for them. This will be used for Secure AVIC enabled guests to allow the hypervisor to inject system vectors which are emulated by the hypervisor such as APIC timer vector and HYPERVISOR_CALLBACK_VECTOR. While at it, cleanup line break in apic_update_irq_cfg(). 
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828110255.208779-3-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/apic.h | 9 +++++++++ arch/x86/kernel/apic/vector.c | 28 +++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 44b4080721a6..0683318470be 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -318,6 +318,8 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); + char *name; }; @@ -471,6 +473,12 @@ static __always_inline bool apic_id_valid(u32 apic_id) return apic_id <= apic->max_apic_id; } +static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + if (apic->update_vector) + apic->update_vector(cpu, vector, set); +} + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -482,6 +490,7 @@ static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } +static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b64..bddc54465399 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + 
apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd) return; 
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. */ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; From 8c79a68de1d2d63537f2a318e5a3b27744c835ad Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:32:43 +0530 Subject: [PATCH 11/49] x86/apic: Add an update_vector() callback for Secure AVIC Add an update_vector() callback to set/clear the ALLOWED_IRR field in a vCPU's APIC backing page for vectors which are emulated by the hypervisor. The ALLOWED_IRR field indicates the interrupt vectors which the guest allows the hypervisor to inject (typically for emulated devices). Interrupt vectors used exclusively by the guest itself and the vectors which are not emulated by the hypervisor, such as IPI vectors, should not be set by the guest in the ALLOWED_IRR fields. As clearing/setting state of a vector will also be used in subsequent commits for other APIC registers (such as APIC_IRR update for sending IPI), add a common update_vector() in the Secure AVIC driver. [ bp: Massage commit message. 
] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110255.208779-4-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 56c51ea4e5ab..942d3aa25082 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -27,6 +27,22 @@ static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); } +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + #define SAVIC_ALLOWED_IRR 0x204 /* @@ -144,6 +160,11 @@ static void savic_write(u32 reg, u32 data) } } +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -217,6 +238,8 @@ static struct apic apic_x2apic_savic __ro_after_init = { .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, + + .update_vector = savic_update_vector, }; apic_driver(apic_x2apic_savic); From 2c6978ea1a85603fe7d401f7bb3a1fbcab21fde2 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:38:24 +0530 Subject: [PATCH 12/49] x86/apic: Add support to send IPI for Secure AVIC Secure AVIC hardware accelerates only Self-IPI, i.e. 
on WRMSR to APIC_SELF_IPI and APIC_ICR (with destination shorthand equal to Self) registers, hardware takes care of updating the APIC_IRR in the APIC backing page of the vCPU. For other IPI types (cross-vCPU, broadcast IPIs), software needs to take care of updating the APIC_IRR state of the target vCPUs and to ensure that the target vCPUs notice the new pending interrupt. Add new callbacks in the Secure AVIC driver for sending IPI requests. These callbacks update the IRR in the target guest vCPU's APIC backing page. To ensure that the remote vCPU notices the new pending interrupt, reuse the GHCB MSR handling code in vc_handle_msr() to issue APIC_ICR MSR-write GHCB protocol event to the hypervisor. For Secure AVIC guests, on APIC_ICR write MSR exits, the hypervisor notifies the target vCPU by either sending an AVIC doorbell (if target vCPU is running) or by waking up the non-running target vCPU. Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110824.208851-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 28 ++++++ arch/x86/coco/sev/vc-handle.c | 11 ++- arch/x86/include/asm/sev-internal.h | 2 + arch/x86/include/asm/sev.h | 2 + arch/x86/kernel/apic/x2apic_savic.c | 138 +++++++++++++++++++++++++++- 5 files changed, 173 insertions(+), 8 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 7669aafcad95..bb33fc2265db 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,34 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +void savic_ghcb_msr_write(u32 reg, u64 value) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { + .cx = msr, + .ax = lower_32_bits(value), + .dx = upper_32_bits(value) + }; + struct es_em_ctxt ctxt = { .regs = &regs }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + 
guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); + /* MSR writes should never fail. Any failure is fatal error for SNP guest */ + snp_abort(); + } + + __sev_put_ghcb(&state); +} + enum es_result savic_register_gpa(u64 gpa) { struct ghcb_state state; diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8..c1aa10ce9d54 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -402,14 +402,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool return ES_OK; } -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - bool write; - - /* Is it a WRMSR? 
*/ - write = ctxt->insn.opcode.bytes[1] == 0x30; switch (regs->cx) { case MSR_SVSM_CAA: @@ -439,6 +435,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30); +} + static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) { int trapnr = ctxt->fi.vector; diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e..6876655183a6 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -97,6 +97,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write); + void snp_register_ghcb_early(unsigned long paddr); bool sev_es_negotiate_protocol(void); bool sev_es_check_cpu_features(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9036122a6d45..fa2864eb3e20 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -607,6 +608,7 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c 
index 942d3aa25082..47dfbf0c5ec5 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -8,6 +8,7 @@ */ #include <linux/cc_platform.h> +#include <linux/cpumask.h> #include <linux/percpu-defs.h> #include <linux/align.h> @@ -120,6 +121,73 @@ static u32 savic_read(u32 reg) #define SAVIC_NMI_REQ 0x278 +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. + */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector) +{ + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector); + } +} + +static inline void self_ipi(unsigned int vector) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector); + break; + case APIC_DEST_ALLINC: + self_ipi(vector); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector); + break; + default: + send_ipi_dest(icr_high, vector); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + static void savic_write(u32 reg, u32 data) { void *ap = this_cpu_ptr(savic_page); @@ -130,7 +198,6 @@ static void savic_write(u32 reg, u32 data) case APIC_LVT1: case APIC_TMICT: case APIC_TDCR: - case 
APIC_SELF_IPI: case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: @@ -146,7 +213,10 @@ static void savic_write(u32 reg, u32 data) apic_set_reg(ap, reg, data); break; case APIC_ICR: - apic_set_reg64(ap, reg, (u64)data); + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); break; /* ALLOWED_IRR offsets are writable */ case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70: @@ -160,6 +230,61 @@ static void savic_write(u32 reg, u32 data) } } +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) { update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); @@ -231,13 +356,20 @@ static struct apic apic_x2apic_savic __ro_after_init = { .calc_dest_apicid = apic_default_calc_apicid, + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + 
.send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + .nmi_to_offline_cpu = true, .read = savic_read, .write = savic_write, .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, - .icr_write = native_x2apic_icr_write, + .icr_write = savic_icr_write, .update_vector = savic_update_vector, }; From ea7d792e11e10f502933c39f3836cb73d35dac36 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:39:26 +0530 Subject: [PATCH 13/49] x86/apic: Support LAPIC timer for Secure AVIC Secure AVIC requires the LAPIC timer to be emulated by the hypervisor. KVM already supports emulating the LAPIC timer using hrtimers. In order to emulate it, APIC_LVTT, APIC_TMICT and APIC_TDCR register values need to be propagated to the hypervisor for arming the timer. APIC_TMCCT register value has to be read from the hypervisor, which is required for calibrating the APIC timer. So, read/write all APIC timer registers from/to the hypervisor. 
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828110926.208866-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/apic/apic.c | 2 ++ arch/x86/kernel/apic/x2apic_savic.c | 7 +++++-- 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index bb33fc2265db..da9fa9d7254b 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1108,6 +1108,32 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +u64 savic_ghcb_msr_read(u32 reg) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { .cx = msr }; + struct es_em_ctxt ctxt = { .regs = &regs }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); + /* MSR read failures are treated as fatal errors */ + snp_abort(); + } + + __sev_put_ghcb(&state); + + return regs.ax | regs.dx << 32; +} + void savic_ghcb_msr_write(u32 reg, u64 value) { u64 msr = APIC_BASE_MSR + (reg >> 4); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index fa2864eb3e20..875c7669ba95 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +u64 savic_ghcb_msr_read(u32 reg); void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) @@ -609,6 +610,7 @@ static inline void __init 
snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } +static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7874284c1ca7..db18810576bc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); } /* diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 47dfbf0c5ec5..bdefe4cd4e29 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -67,6 +67,7 @@ static u32 savic_read(u32 reg) case APIC_TMICT: case APIC_TMCCT: case APIC_TDCR: + return savic_ghcb_msr_read(reg); case APIC_ID: case APIC_LVR: case APIC_TASKPRI: @@ -194,10 +195,12 @@ static void savic_write(u32 reg, u32 data) switch (reg) { case APIC_LVTT: - case APIC_LVT0: - case APIC_LVT1: case APIC_TMICT: case APIC_TDCR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_LVT0: + case APIC_LVT1: case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: From c77683eccf53428a6934df76702e33c0faf46fe5 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 28 Aug 2025 16:41:41 +0530 Subject: [PATCH 14/49] x86/sev: Initialize VGIF for secondary vCPUs for Secure AVIC Virtual GIF (VGIF) provides masking capability for when virtual interrupts (virtual maskable interrupts, virtual NMIs) can be taken by the guest vCPU. The Secure AVIC hardware reads VGIF state from the vCPU's VMSA. So, set VGIF for secondary CPUs (the configuration for the boot CPU is done by the hypervisor), to unmask delivery of virtual interrupts to the vCPU. 
Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111141.208920-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index da9fa9d7254b..37b1d41e68d0 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -974,6 +974,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + vmsa->vintr_ctrl |= V_GIF_MASK; + /* SVME must be set. */ vmsa->efer = EFER_SVME; From 9de196f519a505cf104216d6f1d8688570dacca4 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:42:13 +0530 Subject: [PATCH 15/49] x86/apic: Add support to send NMI IPI for Secure AVIC Secure AVIC introduces a new field in the APIC backing page "NmiReq" that has to be set by the guest to request a NMI IPI through APIC_ICR write. Add support to set NmiReq appropriately to send NMI IPI. 
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111213.208933-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bdefe4cd4e29..8ed56e87c32f 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -133,12 +133,15 @@ static inline void self_ipi_reg_write(unsigned int vector) native_apic_msr_write(APIC_SELF_IPI, vector); } -static void send_ipi_dest(unsigned int cpu, unsigned int vector) +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) { - update_vector(cpu, APIC_IRR, vector, true); + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); } -static void send_ipi_allbut(unsigned int vector) +static void send_ipi_allbut(unsigned int vector, bool nmi) { unsigned int cpu, src_cpu; @@ -149,14 +152,17 @@ static void send_ipi_allbut(unsigned int vector) for_each_cpu(cpu, cpu_online_mask) { if (cpu == src_cpu) continue; - send_ipi_dest(cpu, vector); + send_ipi_dest(cpu, vector, nmi); } } -static inline void self_ipi(unsigned int vector) +static inline void self_ipi(unsigned int vector, bool nmi) { u32 icr_low = APIC_SELF_IPI | vector; + if (nmi) + icr_low |= APIC_DM_NMI; + native_x2apic_icr_write(icr_low, 0); } @@ -164,22 +170,24 @@ static void savic_icr_write(u32 icr_low, u32 icr_high) { unsigned int dsh, vector; u64 icr_data; + bool nmi; dsh = icr_low & APIC_DEST_ALLBUT; vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); switch (dsh) { case APIC_DEST_SELF: - self_ipi(vector); + self_ipi(vector, nmi); break; case APIC_DEST_ALLINC: - self_ipi(vector); + 
self_ipi(vector, nmi); fallthrough; case APIC_DEST_ALLBUT: - send_ipi_allbut(vector); + send_ipi_allbut(vector, nmi); break; default: - send_ipi_dest(icr_high, vector); + send_ipi_dest(icr_high, vector, nmi); break; } From 869e36b9660dd72ab960b74c55d7a200c22588d0 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:42:43 +0530 Subject: [PATCH 16/49] x86/apic: Allow NMI to be injected from hypervisor for Secure AVIC Secure AVIC requires the "AllowedNmi" bit in the Secure AVIC Control MSR to be set for an NMI to be injected from the hypervisor. So set it. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111243.208946-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2a6d4fd8659a..1291e053e40c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -703,6 +703,9 @@ #define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) #define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 8ed56e87c32f..bbaedb48a7fb 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -328,6 +328,8 @@ static void savic_setup(void) res = savic_register_gpa(gpa); if (res != ES_OK) snp_abort(); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_ALLOWEDNMI); } static int 
savic_probe(void) From 28bbfad229e4addf9990279c73c07b762b4a04e4 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 28 Aug 2025 16:43:15 +0530 Subject: [PATCH 17/49] x86/sev: Enable NMI support for Secure AVIC Now that support to send NMI IPI and support to inject NMI from the hypervisor has been added, set V_NMI_ENABLE in the VINTR_CTRL field of the VMSA to enable NMI for Secure AVIC guests. [ bp: Zap useless brackets. ] Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111315.208959-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 37b1d41e68d0..e4740611228d 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -975,7 +975,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) - vmsa->vintr_ctrl |= V_GIF_MASK; + vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK; /* SVME must be set. */ vmsa->efer = EFER_SVME; From 8e3714305ad29866d27aa354f09fd03036f44375 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:43:56 +0530 Subject: [PATCH 18/49] x86/apic: Read and write LVT* APIC registers from HV for SAVIC guests The Hypervisor needs information about the current state of the LVT registers for device emulation and NMIs. So, forward reads and write of these registers to the hypervisor for Secure AVIC enabled guests. 
Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111356.208972-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index bbaedb48a7fb..b6d6e7a69c89 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -67,6 +67,11 @@ static u32 savic_read(u32 reg) case APIC_TMICT: case APIC_TMCCT: case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: return savic_ghcb_msr_read(reg); case APIC_ID: case APIC_LVR: @@ -76,11 +81,6 @@ static u32 savic_read(u32 reg) case APIC_LDR: case APIC_SPIV: case APIC_ESR: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVT0: - case APIC_LVT1: - case APIC_LVTERR: case APIC_EFEAT: case APIC_ECTRL: case APIC_SEOI: @@ -205,18 +205,18 @@ static void savic_write(u32 reg, u32 data) case APIC_LVTT: case APIC_TMICT: case APIC_TDCR: - savic_ghcb_msr_write(reg, data); - break; case APIC_LVT0: case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; case APIC_TASKPRI: case APIC_EOI: case APIC_SPIV: case SAVIC_NMI_REQ: case APIC_ESR: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVTERR: case APIC_ECTRL: case APIC_SEOI: case APIC_IER: From 43b6687ac8777821973d790ff9e9565a84cf6b98 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:46:54 +0530 Subject: [PATCH 19/49] x86/apic: Handle EOI writes for Secure AVIC guests Secure AVIC accelerates the guest's EOI MSR writes for edge-triggered interrupts. For level-triggered interrupts, EOI MSR writes trigger a #VC exception with an SVM_EXIT_AVIC_UNACCELERATED_ACCESS error code. 
To complete EOI handling, the #VC exception handler would need to trigger a GHCB protocol MSR write event to notify the hypervisor about completion of the level-triggered interrupt. Hypervisor notification is required for cases like emulated IO-APIC, to complete and clear interrupt in the IO-APIC's interrupt state. However, #VC exception handling adds extra performance overhead for APIC register writes. In addition, for Secure AVIC, some unaccelerated APIC register MSR writes are trapped, whereas others are faulted. This results in additional complexity in #VC exception handling for unaccelerated APIC MSR accesses. So, directly do a GHCB protocol based APIC EOI MSR write from apic->eoi() callback for level-triggered interrupts. Use WRMSR for edge-triggered interrupts, so that hardware re-evaluates any pending interrupt which can be delivered to the guest vCPU. For level-triggered interrupts, re-evaluation happens on return from VMGEXIT corresponding to the GHCB event for APIC EOI MSR write. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828111654.208987-1-Neeraj.Upadhyay@amd.com --- arch/x86/kernel/apic/x2apic_savic.c | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index b6d6e7a69c89..d76faeaced83 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -301,6 +301,35 @@ static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); } +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? 
*/ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -380,7 +409,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .read = savic_read, .write = savic_write, - .eoi = native_apic_msr_eoi, + .eoi = savic_eoi, .icr_read = native_x2apic_icr_read, .icr_write = savic_icr_write, From c8018325dd3e7c75c19b1e9263c358c4c96214f9 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:50:08 +0530 Subject: [PATCH 20/49] x86/apic: Add kexec support for Secure AVIC Add an apic->teardown() callback to disable Secure AVIC before rebooting into the new kernel. This ensures that the new kernel does not access the old APIC backing page which was allocated by the previous kernel. Such accesses can happen if there are any APIC accesses done during the guest boot before Secure AVIC driver probe is done by the new kernel (as Secure AVIC would have remained enabled in the Secure AVIC control MSR). 
Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828112008.209013-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/core.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/apic/apic.c | 3 +++ arch/x86/kernel/apic/x2apic_savic.c | 8 ++++++++ 5 files changed, 37 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index e4740611228d..b64f43010a12 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1187,6 +1187,29 @@ enum es_result savic_register_gpa(u64 gpa) return res; } +enum es_result savic_unregister_gpa(u64 *gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0); + if (gpa && res == ES_OK) + *gpa = ghcb->save.rbx; + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0683318470be..a26e66d66444 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,6 +306,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); void (*setup)(void); + void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 875c7669ba95..46915dd163ed 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -534,6 +534,7 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); enum es_result savic_register_gpa(u64 gpa); +enum es_result 
savic_unregister_gpa(u64 *gpa); u64 savic_ghcb_msr_read(u32 reg); void savic_ghcb_msr_write(u32 reg, u64 value); @@ -609,6 +610,7 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; } static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index db18810576bc..680d305589a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1170,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index d76faeaced83..36e6d0dbcc9c 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -330,6 +330,13 @@ static void savic_eoi(void) } } +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + static void savic_setup(void) { void *ap = this_cpu_ptr(savic_page); @@ -385,6 +392,7 @@ static struct apic apic_x2apic_savic __ro_after_init = { .probe = savic_probe, .acpi_madt_oem_check = savic_acpi_madt_oem_check, .setup = savic_setup, + .teardown = savic_teardown, .dest_mode_logical = false, From c4074ab87f3483deb15f277f302f199cdb997738 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 16:51:26 +0530 Subject: [PATCH 21/49] x86/apic: Enable Secure AVIC in the control MSR With all the pieces in place now, enable Secure AVIC in the Secure AVIC Control MSR. 
Any access to x2APIC MSRs is emulated by the hypervisor before Secure AVIC is enabled in the control MSR. Post Secure AVIC enablement, all x2APIC MSR accesses (whether accelerated by AVIC hardware or trapped as a #VC exception) operate on the vCPU's APIC backing page. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828112126.209028-1-Neeraj.Upadhyay@amd.com --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/apic/x2apic_savic.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1291e053e40c..5951344009f1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -704,6 +704,8 @@ #define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) #define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 #define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index 36e6d0dbcc9c..b846de0fbcfa 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -365,7 +365,8 @@ static void savic_setup(void) if (res != ES_OK) snp_abort(); - native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_ALLOWEDNMI); + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); } static int savic_probe(void) From 952aefeebb3339d8129f7ca7fdb8f4344b6543a7 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 17:01:19 +0530 Subject: [PATCH 22/49] x86/sev: Prevent SECURE_AVIC_CONTROL MSR interception for Secure AVIC guests The SECURE_AVIC_CONTROL MSR holds the GPA of the guest APIC backing page 
and bitfields to control enablement of Secure AVIC and whether the guest allows NMIs to be injected by the hypervisor. This MSR is populated by the guest and can be read by the guest to get the GPA of the APIC backing page. The MSR can only be accessed in Secure AVIC mode. Any attempt to access it when not in Secure AVIC mode results in #GP. So, the hypervisor should not intercept it. A #VC exception will be generated otherwise. If this occurs and Secure AVIC is enabled, terminate the guest execution. Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828113119.209135-1-Neeraj.Upadhyay@amd.com --- arch/x86/coco/sev/vc-handle.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c1aa10ce9d54..0fd94b7ce191 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -415,6 +415,15 @@ enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(ctxt, write); break; + case MSR_AMD64_SAVIC_CONTROL: + /* + * AMD64_SAVIC_CONTROL should not be intercepted when + * Secure AVIC is enabled. Terminate the Secure AVIC guest + * if the interception is enabled. + */ + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return ES_VMM_ERROR; + break; default: break; } From 27a17e02418e978198513edfb389b65237f4eaf5 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 17:02:24 +0530 Subject: [PATCH 23/49] x86/sev: Indicate the SEV-SNP guest supports Secure AVIC Now that Secure AVIC support is complete, make it part of the SNP present features. 
Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828113225.209174-1-Neeraj.Upadhyay@amd.com --- arch/x86/boot/compressed/sev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 74e083feb2d9..048d3e8839c3 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -238,13 +238,20 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + /* * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ #define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ - MSR_AMD64_SNP_SECURE_TSC) + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) u64 snp_get_unsupported_features(u64 status) { From 37dbd78f98a80e89b5413f4649d0fbd023d99b2f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:08 +0200 Subject: [PATCH 24/49] x86/sev: Move GHCB page based HV communication out of startup code Both the decompressor and the core kernel implement an early #VC handler, which only deals with CPUID instructions, and a full-featured one, which can handle any #VC exception. The former communicates with the hypervisor using the MSR based protocol, whereas the latter uses a shared GHCB page, which is configured a bit later during the boot, when the kernel runs from its ordinary virtual mapping, rather than the 1:1 mapping that the startup code uses. 
Accessing this shared GHCB page from the core kernel's startup code is problematic, because it involves converting the GHCB address provided by the caller to a physical address. In the startup code, virtual to physical address translations are problematic, given that the virtual address might be a 1:1 mapped address, and such translations should therefore be avoided. This means that exposing startup code dealing with the GHCB to callers that execute from the ordinary kernel virtual mapping should be avoided too. So move all GHCB page based communication out of the startup code, now that all communication occurring before the kernel virtual mapping is up relies on the MSR protocol only. As an exception, add a flag representing the need to apply the coherency fix in order to avoid exporting CPUID* helpers because of the code running too early for the *cpu_has* infrastructure. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-29-ardb+git@google.com --- arch/x86/boot/compressed/sev-handle-vc.c | 3 + arch/x86/boot/compressed/sev.c | 2 + arch/x86/boot/cpuflags.c | 13 -- arch/x86/boot/startup/sev-shared.c | 145 +---------------------- arch/x86/boot/startup/sev-startup.c | 42 ------- arch/x86/boot/startup/sme.c | 1 + arch/x86/coco/sev/core.c | 76 ++++++++++++ arch/x86/coco/sev/vc-handle.c | 2 + arch/x86/coco/sev/vc-shared.c | 94 +++++++++++++++ arch/x86/include/asm/sev-internal.h | 7 +- arch/x86/include/asm/sev.h | 12 +- 11 files changed, 196 insertions(+), 201 deletions(-) diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 89dd02de2a0f..7530ad8b768b 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include "error.h" #include "sev.h" #include @@ -14,6 +15,8 @@ #include #define __BOOT_COMPRESSED +#undef __init +#define __init /* 
Basic instruction decoding support needed */ #include "../../lib/inat.c" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 3628e9bddc6a..f197173d60e6 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -371,6 +371,8 @@ static int sev_check_cpu_support(void) if (!(eax & BIT(1))) return -ENODEV; + sev_snp_needs_sfw = !(ebx & BIT(31)); + return ebx & 0x3f; } diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 63e037e94e4c..916bac09b464 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -106,18 +106,5 @@ void get_cpuflags(void) cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], &cpu.flags[1]); } - - if (max_amd_level >= 0x8000001f) { - u32 ebx; - - /* - * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in - * the virtualization flags entry (word 8) and set by - * scattered.c, so the bit needs to be explicitly set. - */ - cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored); - if (ebx & BIT(31)) - set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags); - } } } diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 83c222a4f1fa..348811aa7847 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -13,12 +13,9 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#undef vc_forward_exception -#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif /* @@ -39,7 +36,7 @@ u64 boot_svsm_caa_pa __ro_after_init; * * GHCB protocol version negotiated with the hypervisor. */ -static u16 ghcb_version __ro_after_init; +u16 ghcb_version __ro_after_init; /* Copy of the SNP firmware's CPUID page. 
*/ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; @@ -54,15 +51,7 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} +bool sev_snp_needs_sfw; void __head __noreturn sev_es_terminate(unsigned int set, unsigned int reason) @@ -100,72 +89,7 @@ u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. 
*/ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static inline int svsm_process_result_codes(struct svsm_call *call) +int svsm_process_result_codes(struct svsm_call *call) { switch (call->rax_out) { case SVSM_SUCCESS: @@ -193,7 +117,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call) * - RAX specifies the SVSM protocol/callid as input and the return code * as output. */ -static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) +void svsm_issue_call(struct svsm_call *call, u8 *pending) { register unsigned long rax asm("rax") = call->rax; register unsigned long rcx asm("rcx") = call->rcx; @@ -216,7 +140,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) call->r9_out = r9; } -static int svsm_perform_msr_protocol(struct svsm_call *call) +int svsm_perform_msr_protocol(struct svsm_call *call) { u8 pending = 0; u64 val, resp; @@ -247,63 +171,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call) return svsm_process_result_codes(call); } -static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) -{ - struct es_em_ctxt ctxt; - u8 pending = 0; - - vc_ghcb_invalidate(ghcb); - - /* - * Fill in protocol and format specifiers. This can be called very early - * in the boot, so use rip-relative references as needed. 
- */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - - svsm_issue_call(call, &pending); - - if (pending) - return -EINVAL; - - switch (verify_exception_info(ghcb, &ctxt)) { - case ES_OK: - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - fallthrough; - default: - return -EINVAL; - } - - return svsm_process_result_codes(call); -} - -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) { u64 val; @@ -793,7 +660,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * If validating memory (making it private) and affected by the * cache-coherency vulnerability, perform the cache eviction mitigation. 
*/ - if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + if (validate && sev_snp_needs_sfw) sev_evict_cache((void *)vaddr, 1); } diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 3da04a715831..fd18a00f000e 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,15 +41,6 @@ #include #include -/* For early boot hypervisor communication in SEV-ES enabled guests */ -struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -struct ghcb *boot_ghcb __section(".data"); - /* Bitmap of SEV features supported by the hypervisor */ u64 sev_hv_features __ro_after_init; @@ -139,39 +130,6 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state) } } -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? 
svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - native_local_irq_restore(flags); - - return ret; -} - void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, enum psc_op op) diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 70ea1748c0a7..bf9153b9a3d9 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp) return; me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); /* Check the SEV MSR whether SEV or SME is enabled */ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..2a28d14425d4 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -101,6 +101,15 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); + static u64 __init get_snp_jump_table_addr(void) { struct snp_secrets_page *secrets; @@ -154,6 +163,73 @@ static u64 __init get_jump_table_addr(void) return ret; } +static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) +{ + struct es_em_ctxt ctxt; + u8 pending = 0; + + vc_ghcb_invalidate(ghcb); + + /* + * Fill in protocol and format specifiers. This can be called very early + * in the boot, so use rip-relative references as needed. 
+ */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + + svsm_issue_call(call, &pending); + + if (pending) + return -EINVAL; + + switch (verify_exception_info(ghcb, &ctxt)) { + case ES_OK: + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + fallthrough; + default: + return -EINVAL; + } + + return svsm_process_result_codes(call); +} + +static int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, int ret, u64 svsm_ret) { diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8..357389456296 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -351,6 +351,8 @@ fault: } #define sev_printk(fmt, ...) 
printk(fmt, ##__VA_ARGS__) +#define error(v) +#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index b4688f69102e..9b01c9ad81be 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,6 +409,53 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { u32 cr4 = native_read_cr4(); @@ -549,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = 
sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e..6199b35a82e4 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -2,7 +2,6 @@ #define DR7_RESET_VALUE 0x400 -extern struct ghcb boot_ghcb_page; extern u64 sev_hv_features; extern u64 sev_secrets_pa; @@ -80,7 +79,8 @@ static __always_inline u64 svsm_get_caa_pa(void) return boot_svsm_caa_pa; } -int svsm_perform_call_protocol(struct svsm_call *call); +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); +void vc_forward_exception(struct es_em_ctxt *ctxt); static inline u64 sev_es_rd_ghcb_msr(void) { @@ -97,9 +97,6 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } -void snp_register_ghcb_early(unsigned long paddr); -bool sev_es_negotiate_protocol(void); -bool sev_es_check_cpu_features(void); u64 get_hv_features(void); const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git 
a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index e4622e470ceb..be9d7cb87ad0 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) } void setup_ghcb(void); +void snp_register_ghcb_early(unsigned long paddr); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, @@ -540,8 +541,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -void vc_forward_exception(struct es_em_ctxt *ctxt); - /* I/O parameters for CPUID-related helpers */ struct cpuid_leaf { u32 fn; @@ -552,16 +551,25 @@ struct cpuid_leaf { u32 edx; }; +int svsm_perform_msr_protocol(struct svsm_call *call); int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf); +void svsm_issue_call(struct svsm_call *call, u8 *pending); +int svsm_process_result_codes(struct svsm_call *call); + void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); + +extern u16 ghcb_version; extern struct ghcb *boot_ghcb; +extern bool sev_snp_needs_sfw; #else /* !CONFIG_AMD_MEM_ENCRYPT */ From a5f03880f06a6da6ea5f1d966fffffcb3fc65462 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:09 +0200 Subject: [PATCH 25/49] x86/sev: Avoid global variable to store virtual address of SVSM area The boottime SVSM calling area is used both by the startup code running from a 1:1 mapping, and potentially later on running from the ordinary kernel mapping. 
This SVSM calling area is statically allocated, and so its physical address doesn't change. However, its virtual address depends on the calling context (1:1 mapping or kernel virtual mapping), and even though the variable that holds the virtual address of this calling area gets updated from 1:1 address to kernel address during the boot, it is hard to reason about why this is guaranteed to be safe. So instead, take the RIP-relative address of the boottime SVSM calling area whenever its virtual address is required, and only use a global variable for the physical address. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250828102202.1849035-30-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 5 ++--- arch/x86/boot/startup/sev-shared.c | 7 +------ arch/x86/boot/startup/sev-startup.c | 9 +++++---- arch/x86/coco/sev/core.c | 9 --------- arch/x86/include/asm/sev-internal.h | 3 +-- arch/x86/include/asm/sev.h | 2 -- arch/x86/mm/mem_encrypt_amd.c | 6 ------ 7 files changed, 9 insertions(+), 32 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index f197173d60e6..f2b8dfbd453c 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -37,12 +37,12 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; struct svsm_ca *svsm_get_caa(void) { - return boot_svsm_caa; + /* The decompressor is mapped 1:1 so VA == PA */ + return (struct svsm_ca *)boot_svsm_caa_pa; } u64 svsm_get_caa_pa(void) @@ -532,7 +532,6 @@ bool early_is_sevsnp_guest(void) /* Obtain the address of the calling area to use */ boot_rdmsr(MSR_SVSM_CAA, &m); - boot_svsm_caa = (void *)m.q; boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 348811aa7847..80d4fdada33a 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ 
b/arch/x86/boot/startup/sev-shared.c @@ -13,6 +13,7 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) @@ -26,7 +27,6 @@ * early boot, both with identity mapped virtual addresses and proper kernel * virtual addresses. */ -struct svsm_ca *boot_svsm_caa __ro_after_init; u64 boot_svsm_caa_pa __ro_after_init; /* @@ -720,11 +720,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, if (caa & (PAGE_SIZE - 1)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); - /* - * The CA is identity mapped when this routine is called, both by the - * decompressor code and the early kernel code. - */ - boot_svsm_caa = (struct svsm_ca *)caa; boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index fd18a00f000e..8a06f6026101 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -252,6 +252,7 @@ found_cc_info: static __head void svsm_setup(struct cc_blob_sev_info *cc_info) { + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; u64 pa; @@ -272,21 +273,21 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. + * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. 
* * SVSM_CORE_REMAP_CA call: * RAX = 0 (Protocol=0, CallID=0) * RCX = New CA GPA */ - call.caa = svsm_get_caa(); + call.caa = (struct svsm_ca *)secrets->svsm_caa; call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - boot_svsm_caa = (struct svsm_ca *)pa; boot_svsm_caa_pa = pa; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 2a28d14425d4..ff1e2be8b5a8 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1666,15 +1666,6 @@ void sev_show_status(void) pr_cont("\n"); } -void __init snp_update_svsm_ca(void) -{ - if (!snp_vmpl) - return; - - /* Update the CAA to a proper kernel address */ - boot_svsm_caa = &boot_svsm_ca_page; -} - #ifdef CONFIG_SYSFS static ssize_t vmpl_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 6199b35a82e4..ffe4755962fe 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -60,7 +60,6 @@ void early_set_pages_state(unsigned long vaddr, unsigned long paddr, DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; static __always_inline struct svsm_ca *svsm_get_caa(void) @@ -68,7 +67,7 @@ static __always_inline struct svsm_ca *svsm_get_caa(void) if (sev_cfg.use_cas) return this_cpu_read(svsm_caa); else - return boot_svsm_caa; + return rip_rel_ptr(&boot_svsm_ca_page); } static __always_inline u64 svsm_get_caa_pa(void) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index be9d7cb87ad0..92b1269c877e 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -519,7 +519,6 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void 
sev_show_status(void); -void snp_update_svsm_ca(void); int prepare_pte_enc(struct pte_enc_desc *d); void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); @@ -601,7 +600,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } -static inline void snp_update_svsm_ca(void) { } static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index faf3a13fb6ba..2f8c32173972 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -536,12 +536,6 @@ void __init sme_early_init(void) x86_init.resources.dmi_setup = snp_dmi_setup; } - /* - * Switch the SVSM CA mapping (if active) from identity mapped to - * kernel mapped. - */ - snp_update_svsm_ca(); - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } From d5949ea50c5642ab7e3c4dd6020e23725c079b25 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:10 +0200 Subject: [PATCH 26/49] x86/sev: Share implementation of MSR-based page state change Both the decompressor and the SEV startup code implement the exact same sequence for invoking the MSR based communication protocol to effectuate a page state change. Before tweaking the internal APIs used in both versions, merge them and share them so those tweaks are only needed in a single place. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-31-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 40 +++-------------------------- arch/x86/boot/startup/sev-shared.c | 35 +++++++++++++++++++++++++ arch/x86/boot/startup/sev-startup.c | 29 +-------------------- 3 files changed, 39 insertions(+), 65 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index f2b8dfbd453c..6b62c75ea911 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -60,46 +60,12 @@ static bool sev_snp_enabled(void) return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) -{ - u64 val, msr; - - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. - */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(paddr, paddr, false); - - /* Save the current GHCB MSR value */ - msr = sev_es_rd_ghcb_msr(); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* Restore the GHCB MSR value */ - sev_es_wr_ghcb_msr(msr); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. 
- */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(paddr, paddr, true); -} - void snp_set_page_private(unsigned long paddr) { if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, SNP_PAGE_STATE_PRIVATE); } void snp_set_page_shared(unsigned long paddr) @@ -107,7 +73,7 @@ void snp_set_page_shared(unsigned long paddr) if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, SNP_PAGE_STATE_SHARED); } bool early_setup_ghcb(void) @@ -133,7 +99,7 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, SNP_PAGE_STATE_PRIVATE); } void sev_es_shutdown_ghcb(void) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 80d4fdada33a..00220d7b981b 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -664,6 +664,41 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, sev_evict_cache((void *)vaddr, 1); } +static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, + enum psc_op op) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. 
*/ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true); +} + /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8a06f6026101..5eb7d939ebd3 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -135,7 +135,6 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, enum psc_op op) { unsigned long paddr_end; - u64 val; vaddr = vaddr & PAGE_MASK; @@ -143,37 +142,11 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. 
- */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + __page_state_change(vaddr, paddr, op); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); } void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, From 00d25566761746ba53934ad3a89ea79923a38d01 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:11 +0200 Subject: [PATCH 27/49] x86/sev: Pass SVSM calling area down to early page state change API The early page state change API is mostly only used very early, when only the boot time SVSM calling area is in use. However, this API is also called by the kexec finishing code, which runs very late, and potentially from a different CPU (which uses a different calling area). To avoid pulling the per-CPU SVSM calling area pointers and related SEV state into the startup code, refactor the page state change API so the SVSM calling area virtual and physical addresses can be provided by the caller. No functional change intended. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-32-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 24 +++++++++++++++++++++--- arch/x86/boot/startup/sev-shared.c | 23 ++++++++++++----------- arch/x86/boot/startup/sev-startup.c | 16 ++++++++++++---- arch/x86/coco/sev/core.c | 7 +++++-- arch/x86/include/asm/sev-internal.h | 2 +- arch/x86/include/asm/sev.h | 6 ++++++ 6 files changed, 57 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 6b62c75ea911..0e567410d24d 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -62,18 +62,30 @@ static bool sev_snp_enabled(void) void snp_set_page_private(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, &d); } bool early_setup_ghcb(void) @@ -98,8 +110,14 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, &d); } void sev_es_shutdown_ghcb(void) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 00220d7b981b..fb86f03da663 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -602,7 +602,8 @@ 
static int __head svsm_call_msr_protocol(struct svsm_call *call) return ret; } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) +static void __head svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -615,10 +616,10 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) */ flags = native_local_irq_save(); - call.caa = svsm_get_caa(); + call.caa = caa; pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); pc->num_entries = 1; pc->cur_index = 0; @@ -644,12 +645,12 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) } static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate) + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; if (snp_vmpl) { - svsm_pval_4k_page(paddr, validate); + svsm_pval_4k_page(paddr, validate, caa, caa_pa); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) @@ -665,7 +666,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, } static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, - enum psc_op op) + const struct psc_desc *desc) { u64 val, msr; @@ -673,14 +674,14 @@ static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * If private -> shared then invalidate the page before requesting the * state change in the RMP table. */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); /* Save the current GHCB MSR value */ msr = sev_es_rd_ghcb_msr(); /* Issue VMGEXIT to change the page state in RMP table. 
*/ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); VMGEXIT(); /* Read the response of the VMGEXIT. */ @@ -695,8 +696,8 @@ static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * Now that page state is changed in the RMP table, validate it so that it is * consistent with the RMP entry. */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); } /* diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 5eb7d939ebd3..8009a37d53c1 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -132,7 +132,7 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state) void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) + unsigned long npages, const struct psc_desc *desc) { unsigned long paddr_end; @@ -142,7 +142,7 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - __page_state_change(vaddr, paddr, op); + __page_state_change(vaddr, paddr, desc); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; @@ -152,6 +152,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, svsm_get_caa(), svsm_get_caa_pa() + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -165,12 +169,16 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. 
*/ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); + early_set_pages_state(vaddr, paddr, npages, &d); } void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, svsm_get_caa(), svsm_get_caa_pa() + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -181,7 +189,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, &d); } /* diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index ff1e2be8b5a8..a833b2b31d3d 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -607,8 +607,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) unsigned long vaddr_end; /* Use the MSR protocol when a GHCB is not available. 
*/ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + if (!boot_ghcb) { + struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() }; + + return early_set_pages_state(vaddr, __pa(vaddr), npages, &d); + } vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index ffe4755962fe..9ff824540b48 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -55,7 +55,7 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op); + unsigned long npages, const struct psc_desc *desc); DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 92b1269c877e..0030c7125b29 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -570,6 +570,12 @@ extern u16 ghcb_version; extern struct ghcb *boot_ghcb; extern bool sev_snp_needs_sfw; +struct psc_desc { + enum psc_op op; + struct svsm_ca *ca; + u64 caa_pa; +}; + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 From c54604fb7f2522fec5b97e86103ec49e539e80fe Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:12 +0200 Subject: [PATCH 28/49] x86/sev: Use boot SVSM CA for all startup and init code To avoid having to reason about whether or not to use the per-CPU SVSM calling area when running startup and init code on the boot CPU, reuse the boot SVSM calling area as the per-CPU area for the BSP. Thus, remove the need to make the per-CPU variables and associated state in sev_cfg accessible to the startup code once confined. [ bp: Massage commit message. 
] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-33-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 13 -------- arch/x86/boot/startup/sev-startup.c | 11 ++++--- arch/x86/coco/sev/core.c | 47 ++++++++++++++--------------- arch/x86/include/asm/sev-internal.h | 16 ---------- 4 files changed, 28 insertions(+), 59 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 0e567410d24d..4873469b2a39 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -37,19 +37,6 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED -extern u64 boot_svsm_caa_pa; - -struct svsm_ca *svsm_get_caa(void) -{ - /* The decompressor is mapped 1:1 so VA == PA */ - return (struct svsm_ca *)boot_svsm_caa_pa; -} - -u64 svsm_get_caa_pa(void) -{ - return boot_svsm_caa_pa; -} - u8 snp_vmpl; /* Include code for early handlers */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 8009a37d53c1..b0fc63f8dee1 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -50,9 +50,6 @@ u64 sev_secrets_pa __ro_after_init; /* For early boot SVSM communication */ struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); -DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -DEFINE_PER_CPU(u64, svsm_caa_pa); - /* * Nothing shall interrupt this code path while holding the per-CPU * GHCB. The backup GHCB is only for NMIs interrupting this path. 
@@ -153,7 +150,9 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd unsigned long npages) { struct psc_desc d = { - SNP_PAGE_STATE_PRIVATE, svsm_get_caa(), svsm_get_caa_pa() + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa }; /* @@ -176,7 +175,9 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr unsigned long npages) { struct psc_desc d = { - SNP_PAGE_STATE_SHARED, svsm_get_caa(), svsm_get_caa_pa() + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa }; /* diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index a833b2b31d3d..9782ebe30675 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,25 @@ #include #include +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +static inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return rip_rel_ptr(&boot_svsm_ca_page); +} + +static inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -1312,7 +1331,8 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); + caa = cpu ? 
memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE) + : &boot_svsm_ca_page; per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -1366,32 +1386,9 @@ void __init sev_es_init_vc_handling(void) init_ghcb(cpu); } - /* If running under an SVSM, switch to the per-cpu CA */ - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - int ret; - - local_irq_save(flags); - - /* - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = this_cpu_read(svsm_caa_pa); - ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", - ret, call.rax_out); - + if (snp_vmpl) sev_cfg.use_cas = true; - local_irq_restore(flags); - } - sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 9ff824540b48..f98f080410ad 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -62,22 +62,6 @@ DECLARE_PER_CPU(u64, svsm_caa_pa); extern u64 boot_svsm_caa_pa; -static __always_inline struct svsm_ca *svsm_get_caa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa); - else - return rip_rel_ptr(&boot_svsm_ca_page); -} - -static __always_inline u64 svsm_get_caa_pa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa_pa); - else - return boot_svsm_caa_pa; -} - enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); void vc_forward_exception(struct es_em_ctxt *ctxt); From 68a501d7fd82454525797971c6a0005ceeb93153 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:13 +0200 Subject: [PATCH 29/49] x86/boot: Drop redundant RMPADJUST in SEV SVSM presence check snp_vmpl will be assigned a non-zero value when executing at a VMPL other than 0, and this is inferred from a call to RMPADJUST, which 
only works when running at VMPL0. This means that testing snp_vmpl is sufficient, and there is no need to perform the same check again. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-34-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 4873469b2a39..26aa389f802d 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -406,30 +406,16 @@ void sev_enable(struct boot_params *bp) */ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { u64 hv_features; - int ret; hv_features = get_hv_features(); if (!(hv_features & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* - * Enforce running at VMPL0 or with an SVSM. - * - * Use RMPADJUST (see the rmpadjust() function for a description of - * what the instruction does) to update the VMPL1 permissions of a - * page. If the guest is running at VMPL0, this will succeed. If the - * guest is running at any other VMPL, this will fail. Linux SNP guests - * only ever run at a single VMPL level so permission mask changes of a - * lesser-privileged VMPL are a don't-care. + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. */ - ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1); - - /* - * Running at VMPL0 is not required if an SVSM is present and the hypervisor - * supports the required SVSM GHCB events. 
- */ - if (ret && - !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))) + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } From f27906b287403af53be26341cf86d73798f15fe8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:14 +0200 Subject: [PATCH 30/49] x86/boot: Provide PIC aliases for 5-level paging related constants Provide PIC aliases for the global variables related to 5-level paging, so that the startup code can access them in order to populate the kernel page tables. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-35-ardb+git@google.com --- arch/x86/kernel/head64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc..1bc40d0785ee 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); From 9723dd0c705eb626bac2cd06b83a2c8514ed697a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:15 +0200 Subject: [PATCH 31/49] x86/sev: Provide PIC aliases for SEV related data objects Provide PIC aliases for data objects that are shared between the SEV startup code and the SEV code that executes later. This is needed so that the confined startup code is permitted to access them. 
This requires some of these variables to be moved into a source file that is not part of the startup code, as the PIC alias is already implied, and exporting variables in the opposite direction is not supported. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-36-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 3 +++ arch/x86/boot/startup/sev-shared.c | 19 ---------------- arch/x86/boot/startup/sev-startup.c | 9 -------- arch/x86/coco/sev/core.c | 34 +++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 26aa389f802d..a5e002ff6bff 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -38,6 +38,9 @@ struct ghcb *boot_ghcb; #define __BOOT_COMPRESSED u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index fb86f03da663..2a28463edd99 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -19,25 +19,6 @@ #define WARN(condition, format...) (!!(condition)) #endif -/* - * SVSM related information: - * During boot, the page tables are set up as identity mapped and later - * changed to use kernel virtual addresses. Maintain separate virtual and - * physical addresses for the CAA to allow SVSM functions to be used during - * early boot, both with identity mapped virtual addresses and proper kernel - * virtual addresses. - */ -u64 boot_svsm_caa_pa __ro_after_init; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. 
- */ -u16 ghcb_version __ro_after_init; - /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index b0fc63f8dee1..138b26f14ff1 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,15 +41,6 @@ #include #include -/* Bitmap of SEV features supported by the hypervisor */ -u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -u64 sev_secrets_pa __ro_after_init; - -/* For early boot SVSM communication */ -struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - /* * Nothing shall interrupt this code path while holding the per-CPU * GHCB. The backup GHCB is only for NMIs interrupting this path. diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 9782ebe30675..b9133c825f90 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,29 @@ #include #include +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; +SYM_PIC_ALIAS(sev_hv_features); + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; +SYM_PIC_ALIAS(sev_secrets_pa); + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); +SYM_PIC_ALIAS(boot_svsm_ca_page); + +/* + * SVSM related information: + * During boot, the page tables are set up as identity mapped and later + * changed to use kernel virtual addresses. Maintain separate virtual and + * physical addresses for the CAA to allow SVSM functions to be used during + * early boot, both with identity mapped virtual addresses and proper kernel + * virtual addresses. 
+ */ +u64 boot_svsm_caa_pa __ro_after_init; +SYM_PIC_ALIAS(boot_svsm_caa_pa); + DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); DEFINE_PER_CPU(u64, svsm_caa_pa); @@ -119,6 +142,17 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); */ u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +SYM_PIC_ALIAS(snp_vmpl); + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +u16 ghcb_version __ro_after_init; +SYM_PIC_ALIAS(ghcb_version); /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); From d4077e6ad35121b97f3233da5d60763de3d23df9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:16 +0200 Subject: [PATCH 32/49] x86/sev: Move __sev_[get|put]_ghcb() into separate noinstr object Rename sev-nmi.c to noinstr.c, and move the get/put GHCB routines into it too, which are also annotated as 'noinstr' and suffer from the same problem as the NMI code, i.e., that GCC may ignore the __no_sanitize_address__ function attribute implied by 'noinstr' and insert KASAN instrumentation anyway. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-37-ardb+git@google.com --- arch/x86/boot/startup/sev-startup.c | 74 ---------------------- arch/x86/coco/sev/Makefile | 8 +-- arch/x86/coco/sev/{sev-nmi.c => noinstr.c} | 74 ++++++++++++++++++++++ 3 files changed, 78 insertions(+), 78 deletions(-) rename arch/x86/coco/sev/{sev-nmi.c => noinstr.c} (61%) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 138b26f14ff1..9f4b4ca7deaa 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,83 +41,9 @@ #include #include -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. - */ -noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - void __head early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 342d79f0ab6a..3b8ae214a6a6 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o sev-nmi.o vc-handle.o +obj-y += core.o noinstr.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE_sev-nmi.o := n +UBSAN_SANITIZE_noinstr.o := n # GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining -KASAN_SANITIZE_sev-nmi.o := n -KCSAN_SANITIZE_sev-nmi.o := n +KASAN_SANITIZE_noinstr.o := n +KCSAN_SANITIZE_noinstr.o := n diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c similarity index 61% rename from arch/x86/coco/sev/sev-nmi.c rename to arch/x86/coco/sev/noinstr.c index d8dfaddfb367..b527eafb6312 100644 --- a/arch/x86/coco/sev/sev-nmi.c +++ 
b/arch/x86/coco/sev/noinstr.c @@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. 
+ */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} From 05ce314ba5155d57c86f8f276cb17f78ac5fb4f0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:17 +0200 Subject: [PATCH 33/49] x86/sev: Export startup routines for later use Create aliases that expose routines that are part of the startup code to other code in the core kernel, so that they can be called later as well. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-38-ardb+git@google.com --- arch/x86/boot/startup/exports.h | 14 ++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 2 ++ 2 files changed, 16 insertions(+) create mode 100644 arch/x86/boot/startup/exports.h diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 000000000000..01d2363dc445 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. 
+ */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af1..5d5e3a95e1f9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -535,3 +535,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" From 0d6e4563fc03d83f948e6a6f7963cc31a4c81914 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:18 +0200 Subject: [PATCH 34/49] objtool: Add action to check for absence of absolute relocations The x86 startup code must not use absolute references to code or data, as it executes before the kernel virtual mapping is up. Add an action to objtool to check all allocatable sections (with the exception of __patchable_function_entries, which uses absolute references for nebulous reasons) and raise an error if any absolute references are found. Note that debug sections typically contain lots of absolute references too, but those are not allocatable so they will be ignored. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/20250828102202.1849035-39-ardb+git@google.com --- tools/objtool/arch/x86/decode.c | 12 +++++++ tools/objtool/builtin-check.c | 2 ++ tools/objtool/check.c | 44 +++++++++++++++++++++++++ tools/objtool/include/objtool/arch.h | 1 + tools/objtool/include/objtool/builtin.h | 1 + 5 files changed, 60 insertions(+) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b09..0ad5cc70ecbe 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f0..0f6b197cfcb0 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..fb47327075fb 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4644,6 +4644,47 @@ static void 
disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. + */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4818,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf..be33c7b43180 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d..ab22673862e1 
100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; From 296650c8ac4f18e886dd2a606152c00adf527219 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:19 +0200 Subject: [PATCH 35/49] x86/boot: Check startup code for absence of absolute relocations Invoke objtool on each startup code object individually to check for the absence of absolute relocations. This is needed because this code will be invoked from the 1:1 mapping of memory before those absolute virtual addresses (which are derived from the kernel virtual base address provided to the linker and possibly shifted at boot) are mapped. Only objects built under arch/x86/boot/startup/ have this restriction, and once they have been incorporated into vmlinux.o, this distinction is difficult to make. So force the invocation of objtool for each object file individually, even if objtool is deferred to vmlinux.o for the rest of the build. In the latter case, only pass --noabs and nothing else; otherwise, append it to the existing objtool command line. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-40-ardb+git@google.com --- arch/x86/boot/startup/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index b514f7e81332..32737f4ab5a8 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -19,6 +19,7 @@ KCOV_INSTRUMENT := n obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) lib-$(CONFIG_X86_64) += la57toggle.o lib-$(CONFIG_EFI_MIXED) += efi-mixed.o @@ -28,3 +29,10 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o # to be linked into the decompressor or the EFI stub but not vmlinux # $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. +# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs From 2578560d2259735d7d51364e7991ea92d85fd56c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:20 +0200 Subject: [PATCH 36/49] x86/boot: Revert "Reject absolute references in .head.text" This reverts commit faf0ed487415 ("x86/boot: Reject absolute references in .head.text") The startup code is checked directly for the absence of absolute symbol references, so checking the .head.text section in the relocs tool is no longer needed. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-41-ardb+git@google.com --- arch/x86/tools/relocs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5778bc498415..e5a2b9a912d1 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { - int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); + if (sym->st_shndx == SHN_UNDEF) return 0; @@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } - if (headtext) { - die("Absolute reference to symbol '%s' not permitted in .head.text\n", - symname); - break; - } - /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 bits when From 749627c3980e4421b709857e979e8aa16a4c7147 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:21 +0200 Subject: [PATCH 37/49] x86/kbuild: Incorporate boot/startup/ via Kbuild makefile Using core-y is not the correct way to get kbuild to descend into arch/x86/boot/startup. For instance, building an individual object does not work as expected when the pattern rule is local to the Makefile $ make arch/x86/boot/startup/map_kernel.pi.o GEN Makefile CALL /home/ardb/linux/scripts/checksyscalls.sh DESCEND objtool INSTALL libsubcmd_headers make[3]: *** No rule to make target 'arch/x86/boot/startup/map_kernel.pi.o'. Stop. make[2]: *** [/home/ardb/linux/scripts/Makefile.build:461: arch/x86] Error 2 make[1]: *** [/home/ardb/linux/Makefile:2011: .] 
Error 2 make: *** [/home/ardb/linux/Makefile:248: __sub-make] Error 2 So use obj-y from arch/x86/Kbuild instead, which makes things work as expected. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-42-ardb+git@google.com --- arch/x86/Kbuild | 2 ++ arch/x86/Makefile | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index f7fb3d88c57b..36b985d0e7bf 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -3,6 +3,8 @@ # Branch profiling isn't noinstr-safe. Disable it for arch/x86/* subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING +obj-y += boot/startup/ + obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/ obj-y += entry/ diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1913d342969b..9b76e77ff7f7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -286,7 +286,6 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects -core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y From 7b38dec3c5af54665a4b29483aa02bd1c1e71cf1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:22 +0200 Subject: [PATCH 38/49] x86/boot: Create a confined code area for startup code In order to be able to have tight control over which code may execute from the early 1:1 mapping of memory, but still link vmlinux as a single executable, prefix all symbol references in startup code with __pi_, and invoke it from outside using the __pi_ prefix. Use objtool to check that no absolute symbol references are present in the startup code, as these cannot be used from code running from the 1:1 mapping. Note that this also requires disabling the latent-entropy GCC plugin, as the global symbol references that it injects would require explicit exports, and given that the startup code rarely executes more than once, it is not a useful source of entropy anyway.
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-43-ardb+git@google.com --- arch/x86/boot/startup/Makefile | 14 ++++++++++++++ arch/x86/boot/startup/sev-shared.c | 1 - arch/x86/boot/startup/sme.c | 1 - arch/x86/coco/sev/core.c | 2 +- arch/x86/include/asm/setup.h | 1 + arch/x86/include/asm/sev.h | 1 + arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_64.S | 8 ++++---- arch/x86/mm/mem_encrypt_boot.S | 6 +++--- tools/objtool/check.c | 3 ++- 10 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index 32737f4ab5a8..e8fdf020b422 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ -Os -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ -fno-stack-protector -D__NO_FORTIFY \ -fno-jump-tables \ -include $(srctree)/include/linux/hidden.h @@ -36,3 +37,16 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y # $(pi-objs): objtool-enabled = 1 $(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. 
+# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 2a28463edd99..e09c66845e43 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,7 +12,6 @@ #include #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index bf9153b9a3d9..52b98e7624fe 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -568,7 +568,6 @@ void __head sme_enable(struct boot_params *bp) #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Local version for startup code, which never operates on user page tables */ -__weak pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b9133c825f90..cf9a511b47e0 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -272,7 +272,7 @@ static int svsm_perform_call_protocol(struct svsm_call *call) do { ret = ghcb ? 
svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); + : __pi_svsm_perform_msr_protocol(call); } while (ret == -EAGAIN); if (sev_cfg.ghcbs_initialized) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 692af46603a1..914eb32581c7 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,7 @@ extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void startup_64_load_idt(void *vc_handler); +extern void __pi_startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 0030c7125b29..f222bef9dca8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -551,6 +551,7 @@ struct cpuid_leaf { }; int svsm_perform_msr_protocol(struct svsm_call *call); +int __pi_svsm_perform_msr_protocol(struct svsm_call *call); int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1bc40d0785ee..fd28b53dbac5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -319,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd039..d219963ecb60 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. 
Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index f8a33b25ae86..edbf9c998848 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -SYM_FUNC_START(sme_encrypt_execute) +SYM_FUNC_START(__pi_sme_encrypt_execute) /* * Entry parameters: @@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(sme_encrypt_execute) +SYM_FUNC_END(__pi_sme_encrypt_execute) -SYM_FUNC_START(__enc_copy) +SYM_FUNC_START_LOCAL(__enc_copy) ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c index fb47327075fb..d0d20666e872 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) From e7b88bc0051c5062bdd73b58837cf277d0057358 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:23 +0200 Subject: [PATCH 39/49] efistub/x86: Remap inittext read-execute when needed Recent EFI x86 systems are more strict when it comes to mapping boot images, and require that mappings are either read-write or read-execute. Now that the boot code is being cleaned up and refactored, most of it is being moved into .init.text [where it arguably belongs] but that implies that when booting on such strict EFI firmware, we need to take care to map .init.text (and the .altinstr_aux section that follows it) read-execute as well. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-44-ardb+git@google.com --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/misc.c | 2 ++ arch/x86/include/asm/boot.h | 2 ++ arch/x86/kernel/vmlinux.lds.S | 2 ++ drivers/firmware/efi/libstub/x86-stub.c | 4 +++- 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a38fdcdb9bd..74657589264d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b5991da001..0f41ca0e52c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -332,6 +332,8 @@ static size_t parse_elf(void *output) } const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; const unsigned long kernel_total_size = VO__end - VO__text; static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 02b23aa78955..f7b67cb73915 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -82,6 +82,8 @@ #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; +extern const unsigned long 
kernel_inittext_offset; +extern const unsigned long kernel_inittext_size; extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5d5e3a95e1f9..4277efb26358 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -227,6 +227,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index cafc90d4caaf..0d05eac7c72b 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -788,7 +788,9 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry, *kernel_entry = addr + entry; - return efi_adjust_memory_range_protection(addr, kernel_text_size); + return efi_adjust_memory_range_protection(addr, kernel_text_size) ?: + efi_adjust_memory_range_protection(addr + kernel_inittext_offset, + kernel_inittext_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, From c5c30a37369313d1f8b84e96e6a4397b4e2b4eb8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:24 +0200 Subject: [PATCH 40/49] x86/boot: Move startup code out of __head section Move startup code out of the __head section, now that this no longer has a special significance. Move everything into .text or .init.text as appropriate, so that startup code is not kept around unnecessarily. 
[ bp: Fold in hunk to fix 32-bit CPU hotplug: Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509022207.56fd97f4-lkp@intel.com ] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-45-ardb+git@google.com --- arch/x86/boot/compressed/sev.c | 3 --- arch/x86/boot/startup/gdt_idt.c | 4 ++-- arch/x86/boot/startup/map_kernel.c | 4 ++-- arch/x86/boot/startup/sev-shared.c | 36 ++++++++++++++--------------- arch/x86/boot/startup/sev-startup.c | 14 +++++------ arch/x86/boot/startup/sme.c | 26 ++++++++++----------- arch/x86/include/asm/init.h | 6 ----- arch/x86/kernel/head_32.S | 5 +++- arch/x86/kernel/head_64.S | 2 +- arch/x86/platform/pvh/head.S | 2 +- 10 files changed, 48 insertions(+), 54 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index a5e002ff6bff..57670c172b25 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -32,9 +32,6 @@ struct ghcb *boot_ghcb; #undef __init #define __init -#undef __head -#define __head - #define __BOOT_COMPRESSED u8 snp_vmpl; diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c index a3112a69b06a..d16102abdaec 100644 --- a/arch/x86/boot/startup/gdt_idt.c +++ b/arch/x86/boot/startup/gdt_idt.c @@ -24,7 +24,7 @@ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; /* This may run while still in the direct mapping */ -void __head startup_64_load_idt(void *vc_handler) +void startup_64_load_idt(void *vc_handler) { struct desc_ptr desc = { .address = (unsigned long)rip_rel_ptr(bringup_idt_table), @@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler) /* * Setup boot CPU state needed before kernel switches to virtual addresses. 
*/ -void __head startup_64_setup_gdt_idt(void) +void __init startup_64_setup_gdt_idt(void) { struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); void *handler = NULL; diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c index 332dbe6688c4..83ba98d61572 100644 --- a/arch/x86/boot/startup/map_kernel.c +++ b/arch/x86/boot/startup/map_kernel.c @@ -30,7 +30,7 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd, unsigned long p2v_offset) { @@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, * the 1:1 mapping of memory. Kernel virtual addresses can be determined by * subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long p2v_offset, +unsigned long __init __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index e09c66845e43..08cc1568d8af 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -33,7 +33,7 @@ static u32 cpuid_ext_range_max __ro_after_init; bool sev_snp_needs_sfw; -void __head __noreturn +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -52,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -u64 get_hv_features(void) +u64 __init get_hv_features(void) { u64 val; @@ -222,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. 
*/ -static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -258,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) return xsave_size; } -static bool __head +static bool snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -300,7 +300,7 @@ static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int __head +static int snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), void *ctx, struct cpuid_leaf *leaf) { @@ -396,8 +396,8 @@ snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), - void *ctx, struct cpuid_leaf *leaf) +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -439,7 +439,7 @@ int __head snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), * page yet, so it only supports the MSR based communication with the * hypervisor and only the CPUID exit-code. */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); @@ -515,7 +515,7 @@ struct cc_setup_data { * Search for a Confidential Computing blob passed in as a setup_data entry * via the Linux Boot Protocol. 
*/ -static __head +static __init struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) { struct cc_setup_data *sd = NULL; @@ -543,7 +543,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) * mapping needs to be updated in sync with all the changes to virtual memory * layout and related mapping facilities throughout the boot process. */ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) { const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; int i; @@ -571,7 +571,7 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } -static int __head svsm_call_msr_protocol(struct svsm_call *call) +static int svsm_call_msr_protocol(struct svsm_call *call) { int ret; @@ -582,8 +582,8 @@ static int __head svsm_call_msr_protocol(struct svsm_call *call) return ret; } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate, - struct svsm_ca *caa, u64 caa_pa) +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -624,8 +624,8 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate, native_local_irq_restore(flags); } -static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate, struct svsm_ca *caa, u64 caa_pa) +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; @@ -645,8 +645,8 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, sev_evict_cache((void *)vaddr, 1); } -static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, - const struct psc_desc *desc) +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) { u64 val, msr; @@ -684,7 +684,7 @@ 
static void __head __page_state_change(unsigned long vaddr, unsigned long paddr, * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info, +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, void *page) { struct snp_secrets_page *secrets_page; diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 9f4b4ca7deaa..39465a0ff4e5 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -44,7 +44,7 @@ /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -void __head +void __init early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) { @@ -63,7 +63,7 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, } } -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { struct psc_desc d = { @@ -88,7 +88,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd early_set_pages_state(vaddr, paddr, npages, &d); } -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { struct psc_desc d = { @@ -123,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * * Scan for the blob in that order. 
*/ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -149,7 +149,7 @@ found_cc_info: return cc_info; } -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) { struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; @@ -190,7 +190,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) boot_svsm_caa_pa = pa; } -bool __head snp_init(struct boot_params *bp) +bool __init snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -219,7 +219,7 @@ bool __head snp_init(struct boot_params *bp) return true; } -void __head __noreturn snp_abort(void) +void __init __noreturn snp_abort(void) { sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 52b98e7624fe..2ddde901c8c5 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -91,7 +91,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) return pud; } -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t 
*pmd; @@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted_wp(struct 
sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __head sme_pgtable_calc(unsigned long len) +static unsigned long __init sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len) return entries + tables; } -void __head sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __head sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5a68e9db6518..01ccdd168df0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 -#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector -#else -#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase -#endif - struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void (*free_pgt_page)(void *, void *); /* free buf for page table */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab..80ef5d386b03 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. 
*/ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d219963ecb60..21816b48537c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. */ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1d78e5631bb8..344030c1a81d 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -24,7 +24,7 @@ #include #include - __HEAD + __INIT /* * Entry point for PVH guests. From ce39a6aa8802e718f9b68bf6892612e4fd7f9d2d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Aug 2025 12:22:25 +0200 Subject: [PATCH 41/49] x86/boot: Get rid of the .head.text section The .head.text section is now empty, so it can be dropped from the linker script. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250828102202.1849035-46-ardb+git@google.com --- arch/x86/kernel/vmlinux.lds.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4277efb26358..d7af4a64c211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . 
= ALIGN(PAGE_SIZE); From d4bc3b11c12b41fdb5650f5ad797de97f8dce869 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 3 Sep 2025 17:42:05 +0200 Subject: [PATCH 42/49] x86/apic/savic: Do not use snp_abort() This function is going away so replace the callsites with the equivalent functionality. Add a new SAVIC-specific termination reason. If more granularity is needed there, it will be revisited in the future. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/coco/sev/core.c | 4 ++-- arch/x86/include/asm/sev-common.h | 1 + arch/x86/kernel/apic/x2apic_savic.c | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b64f43010a12..e858e2979db0 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1129,7 +1129,7 @@ u64 savic_ghcb_msr_read(u32 reg) if (res != ES_OK) { pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); /* MSR read failures are treated as fatal errors */ - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); } __sev_put_ghcb(&state); @@ -1159,7 +1159,7 @@ void savic_ghcb_msr_write(u32 reg, u64 value) if (res != ES_OK) { pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); /* MSR writes should never fail. 
Any failure is fatal error for SNP guest */ - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); } __sev_put_ghcb(&state); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0020d77a0800..01a6e4dbe423 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -208,6 +208,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ +#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c index b846de0fbcfa..dbc5678bc3b6 100644 --- a/arch/x86/kernel/apic/x2apic_savic.c +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -363,7 +363,7 @@ static void savic_setup(void) */ res = savic_register_gpa(gpa); if (res != ES_OK) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); @@ -376,13 +376,13 @@ static int savic_probe(void) if (!x2apic_mode) { pr_err("Secure AVIC enabled in non x2APIC mode\n"); - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); /* unreachable */ } savic_page = alloc_percpu(struct secure_avic_page); if (!savic_page) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); return 1; } From 9f8d92a1fbb5a08e17f9d405a1ab27be64096d8c Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 3 Sep 2025 18:14:54 +0200 Subject: [PATCH 43/49] x86/sev: Zap snp_abort() It is a silly oneliner anyway. Replace it with its equivalent. No functional changes. 
Signed-off-by: Borislav Petkov (AMD) --- arch/x86/boot/startup/sev-startup.c | 7 +------ arch/x86/boot/startup/sme.c | 2 +- arch/x86/include/asm/sev.h | 2 -- tools/objtool/noreturns.h | 1 - 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 39465a0ff4e5..a9b0a9c32d8f 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -144,7 +144,7 @@ static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) found_cc_info: if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); return cc_info; } @@ -218,8 +218,3 @@ bool __init snp_init(struct boot_params *bp) return true; } - -void __init __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 2ddde901c8c5..e7ea65f3f1d6 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -532,7 +532,7 @@ void __init sme_enable(struct boot_params *bp) * enablement abort the guest. 
*/ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index f222bef9dca8..32c7dd916e4b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -512,7 +512,6 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __noreturn snp_abort(void); void snp_dmi_setup(void); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); @@ -597,7 +596,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } -static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e..802895fae3ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) From 26a9f90b6101ea2c9d6f02802cf6d85108104b90 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 8 Sep 2025 13:04:18 -0700 Subject: [PATCH 44/49] objtool: Ignore __pi___cfi_ prefixed symbols When building with CONFIG_CFI_CLANG=y 
after the recent series to separate the x86 startup code, there are objtool warnings along the lines of: vmlinux.o: warning: objtool: __pi___cfi_startup_64_load_idt() falls through to next function __pi_startup_64_load_idt() vmlinux.o: warning: objtool: __pi___cfi_startup_64_setup_gdt_idt() falls through to next function __pi_startup_64_setup_gdt_idt() vmlinux.o: warning: objtool: __pi___cfi___startup_64() falls through to next function __pi___startup_64() As the comment in validate_branch() states, this is expected, so ignore these symbols in the same way that __cfi_ and __pfx_ symbols are already ignored for the rest of the kernel. Fixes: 7b38dec3c5af ("x86/boot: Create a confined code area for startup code") Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Acked-by: Ard Biesheuvel --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d0d20666e872..093fcd01dd6e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3565,6 +3565,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || !strncmp(func->name, "__pi___pfx_", 11)) return 0; From 8d73829b78ca1a0e6eb93380f3bf5193d58c281c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Sep 2025 17:19:28 +0200 Subject: [PATCH 45/49] x86/startup/sev: Document the CPUID flow in the boot #VC handler Document the CPUID reading the different SEV guest types do - the SNP one which relies on the presence of a CPUID table and the SEV-ES one, which reads the CPUID supplied by the hypervisor. The intent being to clarify the two back-to-back, similar CPUID invocations. No functional changes. [ bp: Turn into a proper patch. 
] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/fbb24767-0e06-d1d6-36e0-1757d98aca66@amd.com --- arch/x86/boot/startup/sev-shared.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 08cc1568d8af..4e22ffd73516 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -458,6 +458,13 @@ void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. + */ ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; @@ -465,6 +472,10 @@ void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (ret != -EOPNOTSUPP) goto fail; + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ if (__sev_cpuid_hv_msr(&leaf)) goto fail; From e4c00c4ce2aafe61dc7436e763a78d6d112d9e2f Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:04 +0000 Subject: [PATCH 46/49] x86/sev: Add new dump_rmp parameter to snp_leak_pages() API When leaking certain page types, such as Hypervisor Fixed (HV_FIXED) pages, it does not make sense to dump RMP contents for the 2MB range of the page(s) being leaked. In the case of HV_FIXED pages, this is not an error situation where the surrounding 2MB page RMP entries can provide debug information. Add a new __snp_leak_pages() API with a dump_rmp bool parameter so that pages can still be added to the snp_leaked_pages_list without issuing dump_rmpentry().
Make snp_leak_pages() a wrapper for the common case which also allows existing users to continue to dump RMP entries. Suggested-by: Thomas Lendacky Suggested-by: Sean Christopherson Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- arch/x86/include/asm/sev.h | 7 ++++++- arch/x86/virt/svm/sev.c | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 44dae7472246..f9046c4b9a2b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -655,9 +655,13 @@ void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); -void snp_leak_pages(u64 pfn, unsigned int npages); +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); +static inline void snp_leak_pages(u64 pfn, unsigned int pages) +{ + __snp_leak_pages(pfn, pages, true); +} #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_rmptable_init(void) { return -ENOSYS; } @@ -670,6 +674,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as return -ENODEV; } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {} static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) {} diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 942372e69b4d..ee643a6cd691 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -1029,7 +1029,7 @@ int 
rmp_make_shared(u64 pfn, enum pg_level level) } EXPORT_SYMBOL_GPL(rmp_make_shared); -void snp_leak_pages(u64 pfn, unsigned int npages) +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) { struct page *page = pfn_to_page(pfn); @@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages) (PageHead(page) && compound_nr(page) <= npages)) list_add_tail(&page->buddy_list, &snp_leaked_pages_list); - dump_rmpentry(pfn); + if (dump_rmp) + dump_rmpentry(pfn); snp_nr_leaked_pages++; pfn++; page++; } spin_unlock(&snp_leaked_pages_list_lock); } -EXPORT_SYMBOL_GPL(snp_leak_pages); +EXPORT_SYMBOL_GPL(__snp_leak_pages); void kdump_sev_callback(void) { From e09701dcdd9ca06be249091eeb786d57e67b613e Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:33 +0000 Subject: [PATCH 47/49] crypto: ccp - Add new HV-Fixed page allocation/free API When SEV-SNP is active, the TEE extended command header page and all output buffers for TEE extended commands (such as used by Seamless Firmware Servicing support) must be in hypervisor-fixed state, assigned to the hypervisor and marked immutable in the RMP entries. Add a new generic SEV API interface to allocate/free hypervisor fixed pages which abstracts hypervisor fixed page allocation/free for PSP sub devices. The API internally uses SNP_INIT_EX to transition pages to HV-Fixed page state. If SNP is not enabled then the allocator is simply a wrapper over alloc_pages() and __free_pages(). When a sub device frees its pages, they are put on a free list and future allocation requests will try to re-use the freed pages from this list. This list is not preserved across PSP driver load/unload, so this free/reuse mechanism only works while the PSP driver is loaded.
As HV_FIXED page state is only changed at reboot, these pages are leaked as they cannot be returned back to the page allocator and then potentially allocated to guests, which will cause SEV-SNP guests to fail to start or terminate when accessing the HV_FIXED page. Suggested-by: Thomas Lendacky Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/sev-dev.c | 182 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sev-dev.h | 3 + 2 files changed, 185 insertions(+) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e058ba027792..f7b9c6547e18 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -82,6 +82,21 @@ MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */ static bool psp_dead; static int psp_timeout; +enum snp_hv_fixed_pages_state { + ALLOCATED, + HV_FIXED, +}; + +struct snp_hv_fixed_pages_entry { + struct list_head list; + struct page *page; + unsigned int order; + bool free; + enum snp_hv_fixed_pages_state page_state; +}; + +static LIST_HEAD(snp_hv_fixed_pages); + /* Trusted Memory Region (TMR): * The TMR is a 1MB area that must be 1MB aligned. 
Use the page allocator * to allocate the memory, which will return aligned memory for the specified @@ -1073,6 +1088,165 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +/* Hypervisor Fixed pages API interface */ +static void snp_hv_fixed_pages_state_update(struct sev_device *sev, + enum snp_hv_fixed_pages_state page_state) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + entry->page_state = page_state; +} + +/* + * Allocate HV_FIXED pages in 2MB aligned sizes to ensure the whole + * 2MB pages are marked as HV_FIXED. + */ +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry; + struct sev_device *sev; + unsigned int order; + struct page *page; + + if (!psp_master || !psp_master->sev_data) + return NULL; + + sev = psp_master->sev_data; + + order = get_order(PMD_SIZE * num_2mb_pages); + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + /* + * This API uses SNP_INIT_EX to transition allocated pages to HV_Fixed + * page state, fail if SNP is already initialized. 
+ */ + if (sev->snp_initialized) + return NULL; + + /* Re-use freed pages that match the request */ + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + /* Hypervisor fixed page allocator implements exact fit policy */ + if (entry->order == order && entry->free) { + entry->free = false; + memset(page_address(entry->page), 0, + (1 << entry->order) * PAGE_SIZE); + return entry->page; + } + } + + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!page) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + __free_pages(page, order); + return NULL; + } + + entry->page = page; + entry->order = order; + list_add_tail(&entry->list, &snp_hv_fixed_pages); + + return page; +} + +void snp_free_hv_fixed_pages(struct page *page) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry, *nentry; + + if (!psp_master || !psp_master->sev_data) + return; + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + list_for_each_entry_safe(entry, nentry, &snp_hv_fixed_pages, list) { + if (entry->page != page) + continue; + + /* + * HV_FIXED page state cannot be changed until reboot + * and they cannot be used by an SNP guest, so they cannot + * be returned back to the page allocator. + * Mark the pages as free internally to allow possible re-use. 
+ */ + if (entry->page_state == HV_FIXED) { + entry->free = true; + } else { + __free_pages(page, entry->order); + list_del(&entry->list); + kfree(entry); + } + return; + } +} + +static void snp_add_hv_fixed_pages(struct sev_device *sev, struct sev_data_range_list *range_list) +{ + struct snp_hv_fixed_pages_entry *entry; + struct sev_data_range *range; + int num_elements; + + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + num_elements = list_count_nodes(&snp_hv_fixed_pages) + + range_list->num_elements; + + /* + * Ensure the list of HV_FIXED pages that will be passed to firmware + * do not exceed the page-sized argument buffer. + */ + if (num_elements * sizeof(*range) + sizeof(*range_list) > PAGE_SIZE) { + dev_warn(sev->dev, "Additional HV_Fixed pages cannot be accommodated, omitting\n"); + return; + } + + range = &range_list->ranges[range_list->num_elements]; + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + range->base = page_to_pfn(entry->page) << PAGE_SHIFT; + range->page_count = 1 << entry->order; + range++; + } + range_list->num_elements = num_elements; +} + +static void snp_leak_hv_fixed_pages(void) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + if (entry->page_state == HV_FIXED) + __snp_leak_pages(page_to_pfn(entry->page), + 1 << entry->order, false); +} + static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) { struct sev_data_range_list *range_list = arg; @@ -1163,6 +1337,12 @@ static int __sev_snp_init_locked(int *error) return rc; } + /* + * Add HV_Fixed pages from other PSP sub-devices, such as SFS to the + * HV_Fixed page list. 
+ */ + snp_add_hv_fixed_pages(sev, snp_range_list); + memset(&data, 0, sizeof(data)); data.init_rmp = 1; data.list_paddr_en = 1; @@ -1202,6 +1382,7 @@ static int __sev_snp_init_locked(int *error) return rc; } + snp_hv_fixed_pages_state_update(sev, HV_FIXED); sev->snp_initialized = true; dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); @@ -1784,6 +1965,7 @@ static int __sev_snp_shutdown_locked(int *error, bool panic) return ret; } + snp_leak_hv_fixed_pages(); sev->snp_initialized = false; dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n"); diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 3e4e5574e88a..28021abc85ad 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -65,4 +65,7 @@ void sev_dev_destroy(struct psp_device *psp); void sev_pci_init(void); void sev_pci_exit(void); +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages); +void snp_free_hv_fixed_pages(struct page *page); + #endif /* __SEV_DEV_H */ From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: [PATCH 48/49] crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. 
For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 ++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 +++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct 
psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 000000000000..2f4beaafe7ec --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. 
+ * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. 
+ */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. 
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. 
+ */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. 
+ */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 000000000000..94e51670383c --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Servicing (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output).
+ * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Servicing (SFS) IOC + * + * Possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. + * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but an alternative firmware loading + * path can be specified using the kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using the sysfs file: + * /sys/module/firmware_class/parameters/path.
+ */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ From 1f6113ae5ac4927fe80256154ebb0461e670fa85 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 24 Sep 2025 17:53:11 +0200 Subject: [PATCH 49/49] x86/boot: Drop erroneous __init annotation from early_set_pages_state() The kexec code will call set_pages_state() after tearing down all the GHCBs, which will therefore result in a call to early_set_pages_state(). This means the __init annotation is wrong, and must be dropped. Fixes: c5c30a373693 ("x86/boot: Move startup code out of __head section") Reported-by: Srikanth Aithal Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Srikanth Aithal --- arch/x86/boot/startup/sev-startup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index a9b0a9c32d8f..09725428d3e6 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -44,7 +44,7 @@ /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -void __init +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, unsigned long npages, const struct psc_desc *desc) {