From 0c54bf48801d02a4a6b09c16f00ab30896b5e304 Mon Sep 17 00:00:00 2001
From: Michael Young
Date: Apr 09 2024 22:50:29 +0000
Subject: x86: Native Branch History Injection [XSA-456, CVE-2024-2201]

update to xen 4.17.4, remove patches now included upstream
rebase xen.gcc12.fixes.patch
x86 HVM hypercalls may trigger Xen bug check [XSA-454, CVE-2023-46842]
x86: Incorrect logic for BTC/SRSO mitigations [XSA-455, CVE-2024-31142]

---

diff --git a/.gitignore b/.gitignore
index e0e02a4..ab960e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ lwip-1.3.0.tar.gz
 pciutils-2.2.9.tar.bz2
 zlib-1.2.3.tar.gz
 polarssl-1.1.4-gpl.tgz
-/xen-4.17.2.tar.gz
+/xen-4.17.4.tar.gz
diff --git a/sources b/sources
index 730c9d1..4b70847 100644
--- a/sources
+++ b/sources
@@ -4,4 +4,4 @@ SHA512 (newlib-1.16.0.tar.gz) = 40eb96bbc6736a16b6399e0cdb73e853d0d90b685c967e77
 SHA512 (zlib-1.2.3.tar.gz) = 021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e
 SHA512 (polarssl-1.1.4-gpl.tgz) = 88da614e4d3f4409c4fd3bb3e44c7587ba051e3fed4e33d526069a67e8180212e1ea22da984656f50e290049f60ddca65383e5983c0f8884f648d71f698303ad
 SHA512 (pciutils-2.2.9.tar.bz2) = 2b3d98d027e46d8c08037366dde6f0781ca03c610ef2b380984639e4ef39899ed8d8b8e4cd9c9dc54df101279b95879bd66bfd4d04ad07fef41e847ea7ae32b5
-SHA512 (xen-4.17.2.tar.gz) = 0bc475483676e4aa27735695f9a8d2821059e7a55984adb8a29badb5c09a4e7cf8ea29cbc9691be616cc0d7a5ee6b6dacc59ba29c2b16e0919ebdf7dfc54201a
+SHA512 (xen-4.17.4.tar.gz) = 50dc2efd26e48131afdbc8efb1ca90154a84fe5fba5a6bc81d9801a3c13aebea91c8211872b5c69ce2773644f2c935c31cc56584a05b10b81e8728282d89eb84
diff --git a/xen.gcc12.fixes.patch b/xen.gcc12.fixes.patch
index 66d13b1..b35440f 100644
--- a/xen.gcc12.fixes.patch
+++ b/xen.gcc12.fixes.patch
@@ -1,7 +1,7 @@
 --- xen-4.16.0/Config.mk.orig	2021-11-30 11:42:42.000000000 +0000
 +++ xen-4.16.0/Config.mk	2022-01-24 20:25:16.687125822 +0000
 @@ -186,6 +186,7 @@
- $(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement)
+ $(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable)
  $(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs)
 +$(call cc-option-add,CFLAGS,CC,-Wno-error=array-bounds)
diff --git a/xen.git-0ce25b46ab2fb53a1b58f7682ca14971453f4f2c.patch b/xen.git-0ce25b46ab2fb53a1b58f7682ca14971453f4f2c.patch
deleted file mode 100644
index 8fa8051..0000000
--- a/xen.git-0ce25b46ab2fb53a1b58f7682ca14971453f4f2c.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-From 0ce25b46ab2fb53a1b58f7682ca14971453f4f2c Mon Sep 17 00:00:00 2001
-From: =?utf8?q?Roger=20Pau=20Monn=C3=A9?=
-Date: Tue, 5 Mar 2024 11:58:36 +0100
-Subject: [PATCH] x86/spec: do not print thunk option selection if not built-in
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf8
-Content-Transfer-Encoding: 8bit
-
-Since the thunk built-in enable is printed as part of the "Compiled-in
-support:" line, avoid printing anything in "Xen settings:" if the thunk is
-disabled at build time.
-
-Note the BTI-Thunk option printing is also adjusted to print a colon in the
-same way the other options on the line do.
- -Requested-by: Jan Beulich -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -master commit: 576528a2a742069af203e90c613c5c93e23c9755 -master date: 2024-02-27 14:58:40 +0100 ---- - xen/arch/x86/spec_ctrl.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 098fa3184d..25a18ac598 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -516,11 +516,12 @@ static void __init print_details(enum ind_thunk thunk) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -- thunk == THUNK_NONE ? "N/A" : -- thunk == THUNK_RETPOLINE ? "RETPOLINE" : -- thunk == THUNK_LFENCE ? "LFENCE" : -- thunk == THUNK_JMP ? "JMP" : "?", -+ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -+ thunk != THUNK_NONE ? "BTI-Thunk: " : "", -+ thunk == THUNK_NONE ? "" : -+ thunk == THUNK_RETPOLINE ? "RETPOLINE, " : -+ thunk == THUNK_LFENCE ? "LFENCE, " : -+ thunk == THUNK_JMP ? "JMP, " : "?, ", - (!boot_cpu_has(X86_FEATURE_IBRSB) && - !boot_cpu_has(X86_FEATURE_IBRS)) ? "No" : - (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", --- -2.30.2 - diff --git a/xen.git-54dacb5c02cba4676879ed077765734326b78e39.patch b/xen.git-54dacb5c02cba4676879ed077765734326b78e39.patch deleted file mode 100644 index de3e244..0000000 --- a/xen.git-54dacb5c02cba4676879ed077765734326b78e39.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 54dacb5c02cba4676879ed077765734326b78e39 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper -Date: Tue, 5 Mar 2024 12:01:22 +0100 -Subject: [PATCH] x86/cpu-policy: Allow for levelling of VERW side effects -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool. Allow this, by -having them unconditinally set in max, with the host values reflected in -default. Annotate the bits as having special properies. - -Signed-off-by: Andrew Cooper -Reviewed-by: Roger Pau Monné -master commit: de17162cafd27f2865a3102a2ec0f386a02ed03d -master date: 2024-03-01 20:14:19 +0000 ---- - xen/arch/x86/cpu-policy.c | 24 +++++++++++++++++++++ - xen/arch/x86/include/asm/cpufeature.h | 1 + - xen/include/public/arch-x86/cpufeatureset.h | 4 ++-- - 3 files changed, 27 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index f0f2c8a1c0..7b875a7221 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -435,6 +435,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) - __set_bit(X86_FEATURE_RSBA, fs); - __set_bit(X86_FEATURE_RRSBA, fs); - -+ /* -+ * These bits indicate that the VERW instruction may have gained -+ * scrubbing side effects. With pooling, they mean "you might migrate -+ * somewhere where scrubbing is necessary", and may need exposing on -+ * unaffected hardware. This is fine, because the VERW instruction -+ * has been around since the 286. -+ */ -+ __set_bit(X86_FEATURE_MD_CLEAR, fs); -+ __set_bit(X86_FEATURE_FB_CLEAR, fs); -+ - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an - * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. 
-@@ -469,6 +479,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) - cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) ) - __clear_bit(X86_FEATURE_RDRAND, fs); - -+ /* -+ * These bits indicate that the VERW instruction may have gained -+ * scrubbing side effects. The max policy has them set for migration -+ * reasons, so reset the default policy back to the host values in -+ * case we're unaffected. -+ */ -+ __clear_bit(X86_FEATURE_MD_CLEAR, fs); -+ if ( cpu_has_md_clear ) -+ __set_bit(X86_FEATURE_MD_CLEAR, fs); -+ -+ __clear_bit(X86_FEATURE_FB_CLEAR, fs); -+ if ( cpu_has_fb_clear ) -+ __set_bit(X86_FEATURE_FB_CLEAR, fs); -+ - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an - * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. -diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h -index 9ef7756593..ec824e8954 100644 ---- a/xen/arch/x86/include/asm/cpufeature.h -+++ b/xen/arch/x86/include/asm/cpufeature.h -@@ -136,6 +136,7 @@ - #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) - #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) - #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) -+#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) - #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) - #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) - #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 94d211df2f..aec1407613 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -260,7 +260,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single - XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ - XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a VP2INTERSECT{D,Q} insns */ - XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ --XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ -+XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ - XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ - XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ - XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ -@@ -321,7 +321,7 @@ XEN_CPUFEATURE(DOITM, 16*32+12) /* Data Operand Invariant Timing - XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */ - XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */ - XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */ --XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */ -+XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */ - XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */ - XEN_CPUFEATURE(RRSBA, 16*32+19) /*! 
Restricted RSB Alternative */ - XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */ --- -2.30.2 - diff --git a/xen.git-76ea2aab3652cc34e474de0905f0a9cd4df7d087.patch b/xen.git-76ea2aab3652cc34e474de0905f0a9cd4df7d087.patch deleted file mode 100644 index 3603813..0000000 --- a/xen.git-76ea2aab3652cc34e474de0905f0a9cd4df7d087.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 76ea2aab3652cc34e474de0905f0a9cd4df7d087 Mon Sep 17 00:00:00 2001 -From: =?utf8?q?Roger=20Pau=20Monn=C3=A9?= -Date: Tue, 5 Mar 2024 11:57:41 +0100 -Subject: [PATCH] x86/spec: print the built-in SPECULATIVE_HARDEN_* options -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -Just like it's done for INDIRECT_THUNK and SHADOW_PAGING. - -Reported-by: Jan Beulich -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -master commit: 6e9507f7d51fe49df8bc70f83e49ce06c92e4e54 -master date: 2024-02-27 14:57:52 +0100 ---- - xen/arch/x86/spec_ctrl.c | 14 +++++++++++++- - 1 file changed, 13 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 661716d695..93f1cf3bb5 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -488,13 +488,25 @@ static void __init print_details(enum ind_thunk thunk) - (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); - - /* Compiled-in support which pertains to mitigations. */ -- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) -+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) - printk(" Compiled-in support:" - #ifdef CONFIG_INDIRECT_THUNK - " INDIRECT_THUNK" - #endif - #ifdef CONFIG_SHADOW_PAGING - " SHADOW_PAGING" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY -+ " HARDEN_ARRAY" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH -+ " HARDEN_BRANCH" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS -+ " HARDEN_GUEST_ACCESS" - #endif - "\n"); - --- -2.30.2 - diff --git a/xen.git-7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73.patch b/xen.git-7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73.patch deleted file mode 100644 index e3d77d3..0000000 --- a/xen.git-7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73.patch +++ /dev/null @@ -1,143 +0,0 @@ -From 7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73 Mon Sep 17 00:00:00 2001 -From: =?utf8?q?Roger=20Pau=20Monn=C3=A9?= -Date: Tue, 5 Sep 2023 08:45:29 +0200 -Subject: [PATCH] xen/vcpu: ignore VCPU_SSHOTTMR_future -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -The usage of VCPU_SSHOTTMR_future in Linux prior to 4.7 is bogus. -When the hypervisor returns -ETIME (timeout in the past) Linux keeps -retrying to setup the timer with a higher timeout instead of -self-injecting a timer interrupt. - -On boxes without any hardware assistance for logdirty we have seen HVM -Linux guests < 4.7 with 32vCPUs give up trying to setup the timer when -logdirty is enabled: - -CE: Reprogramming failure. Giving up -CE: xen increased min_delta_ns to 1000000 nsec -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. Giving up -CE: xen increased min_delta_ns to 506250 nsec -CE: xen increased min_delta_ns to 759375 nsec -CE: xen increased min_delta_ns to 1000000 nsec -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. 
Giving up -Freezing user space processes ... -INFO: rcu_sched detected stalls on CPUs/tasks: { 14} (detected by 10, t=60002 jiffies, g=4006, c=4005, q=14130) -Task dump for CPU 14: -swapper/14 R running task 0 0 1 0x00000000 -Call Trace: - [] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [] ? default_idle+0x1e/0xd0 - [] ? arch_cpu_idle+0x20/0xc0 - [] ? cpu_startup_entry+0x14a/0x1e0 - [] ? start_secondary+0x1f7/0x270 - [] ? start_cpu+0x5/0x14 -INFO: rcu_sched detected stalls on CPUs/tasks: { 26} (detected by 24, t=60002 jiffies, g=6922, c=6921, q=7013) -Task dump for CPU 26: -swapper/26 R running task 0 0 1 0x00000000 -Call Trace: - [] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [] ? default_idle+0x1e/0xd0 - [] ? arch_cpu_idle+0x20/0xc0 - [] ? cpu_startup_entry+0x14a/0x1e0 - [] ? start_secondary+0x1f7/0x270 - [] ? start_cpu+0x5/0x14 -INFO: rcu_sched detected stalls on CPUs/tasks: { 26} (detected by 24, t=60002 jiffies, g=8499, c=8498, q=7664) -Task dump for CPU 26: -swapper/26 R running task 0 0 1 0x00000000 -Call Trace: - [] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [] ? default_idle+0x1e/0xd0 - [] ? arch_cpu_idle+0x20/0xc0 - [] ? cpu_startup_entry+0x14a/0x1e0 - [] ? start_secondary+0x1f7/0x270 - [] ? start_cpu+0x5/0x14 - -Thus leading to CPU stalls and a broken system as a result. - -Workaround this bogus usage by ignoring the VCPU_SSHOTTMR_future in -the hypervisor. Old Linux versions are the only ones known to have -(wrongly) attempted to use the flag, and ignoring it is compatible -with the behavior expected by any guests setting that flag. - -Note the usage of the flag has been removed from Linux by commit: - -c06b6d70feb3 xen/x86: don't lose event interrupts - -Which landed in Linux 4.7. - -Signed-off-by: Roger Pau Monné -Acked-by: Henry Wang # CHANGELOG -Acked-by: Jan Beulich -master commit: 19c6cbd90965b1440bd551069373d6fa3f2f365d -master date: 2023-05-03 13:36:05 +0200 ---- - CHANGELOG.md | 6 ++++++ - xen/common/domain.c | 13 ++++++++++--- - xen/include/public/vcpu.h | 5 ++++- - 3 files changed, 20 insertions(+), 4 deletions(-) - -diff --git a/CHANGELOG.md b/CHANGELOG.md -index 7f4d0f25e9..bb0eceb69a 100644 ---- a/CHANGELOG.md -+++ b/CHANGELOG.md -@@ -4,6 +4,12 @@ Notable changes to Xen will be documented in this file. - - The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - -+## [4.17.3](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.3) -+ -+### Changed -+ - Ignore VCPUOP_set_singleshot_timer's VCPU_SSHOTTMR_future flag. The only -+ known user doesn't use it properly, leading to in-guest breakage. -+ - ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12 - - ### Changed -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 53f7e734fe..30c2279673 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1691,9 +1691,16 @@ long common_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) - if ( copy_from_guest(&set, arg, 1) ) - return -EFAULT; - -- if ( (set.flags & VCPU_SSHOTTMR_future) && -- (set.timeout_abs_ns < NOW()) ) -- return -ETIME; -+ if ( set.timeout_abs_ns < NOW() ) -+ { -+ /* -+ * Simplify the logic if the timeout has already expired and just -+ * inject the event. 
-+ */ -+ stop_timer(&v->singleshot_timer); -+ send_timer_event(v); -+ break; -+ } - - migrate_timer(&v->singleshot_timer, smp_processor_id()); - set_timer(&v->singleshot_timer, set.timeout_abs_ns); -diff --git a/xen/include/public/vcpu.h b/xen/include/public/vcpu.h -index 81a3b3a743..a836b264a9 100644 ---- a/xen/include/public/vcpu.h -+++ b/xen/include/public/vcpu.h -@@ -150,7 +150,10 @@ typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t; - DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t); - - /* Flags to VCPUOP_set_singleshot_timer. */ -- /* Require the timeout to be in the future (return -ETIME if it's passed). */ -+ /* -+ * Request the timeout to be in the future (return -ETIME if it's passed) -+ * but can be ignored by the hypervisor. -+ */ - #define _VCPU_SSHOTTMR_future (0) - #define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future) - --- -2.30.2 - diff --git a/xen.git-91650010815f3da0834bc9781c4359350d1162a5.patch b/xen.git-91650010815f3da0834bc9781c4359350d1162a5.patch deleted file mode 100644 index 4f790c5..0000000 --- a/xen.git-91650010815f3da0834bc9781c4359350d1162a5.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 91650010815f3da0834bc9781c4359350d1162a5 Mon Sep 17 00:00:00 2001 -From: =?utf8?q?Roger=20Pau=20Monn=C3=A9?= -Date: Tue, 27 Feb 2024 14:11:40 +0100 -Subject: [PATCH] x86/spec: fix BRANCH_HARDEN option to only be set when - build-enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -The current logic to handle the BRANCH_HARDEN option will report it as enabled -even when build-time disabled. Fix this by only allowing the option to be set -when support for it is built into Xen. - -Fixes: 2d6f36daa086 ('x86/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -master commit: 60e00f77a5cc671d30c5ef3318f5b8e9b74e4aa3 -master date: 2024-02-26 16:06:42 +0100 ---- - xen/arch/x86/spec_ctrl.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 56e07d7536..661716d695 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -62,7 +62,8 @@ int8_t __initdata opt_psfd = -1; - int8_t __ro_after_init opt_ibpb_ctxt_switch = -1; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; --static bool __initdata opt_branch_harden = true; -+static bool __initdata opt_branch_harden = -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); - - bool __initdata bsp_delay_spec_ctrl; - uint8_t __read_mostly default_xen_spec_ctrl; -@@ -280,7 +281,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) - opt_l1d_flush = val; - else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) -- opt_branch_harden = val; -+ { -+ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ) -+ opt_branch_harden = val; -+ else -+ { -+ no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s, -+ ss); -+ rc = -EINVAL; -+ } -+ } - else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) - opt_srb_lock = val; - else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) --- -2.30.2 - diff --git a/xen.git-bb13e631432a6fbcc0391431fc27ac85dc438248.patch b/xen.git-bb13e631432a6fbcc0391431fc27ac85dc438248.patch deleted file mode 100644 index 13379c7..0000000 --- a/xen.git-bb13e631432a6fbcc0391431fc27ac85dc438248.patch +++ /dev/null @@ -1,207 +0,0 @@ -From bb13e631432a6fbcc0391431fc27ac85dc438248 
Mon Sep 17 00:00:00 2001 -From: Jan Beulich -Date: Tue, 14 Nov 2023 13:58:18 +0100 -Subject: [PATCH] x86: support data operand independent timing mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -[1] specifies a long list of instructions which are intended to exhibit -timing behavior independent of the data they operate on. On certain -hardware this independence is optional, controlled by a bit in a new -MSR. Provide a command line option to control the mode Xen and its -guests are to operate in, with a build time control over the default. -Longer term we may want to allow guests to control this. - -Since Arm64 supposedly also has such a control, put command line option -and Kconfig control in common files. - -[1] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/best-practices/data-operand-independent-timing-isa-guidance.html - -Requested-by: Demi Marie Obenour -Signed-off-by: Jan Beulich -Acked-by: Roger Pau Monné -master commit: bad1ac345b1910b820b8a703ad1b9f66412ea844 -master date: 2023-10-20 15:50:05 +0200 ---- - CHANGELOG.md | 4 ++++ - docs/misc/xen-command-line.pandoc | 11 +++++++++++ - xen/arch/x86/Kconfig | 1 + - xen/arch/x86/cpu/common.c | 24 ++++++++++++++++++++++++ - xen/arch/x86/include/asm/cpufeature.h | 1 + - xen/common/Kconfig | 18 ++++++++++++++++++ - xen/common/kernel.c | 5 +++++ - xen/include/xen/param.h | 2 ++ - 8 files changed, 66 insertions(+) - -diff --git a/CHANGELOG.md b/CHANGELOG.md -index bb0eceb69a..3da238d5b9 100644 ---- a/CHANGELOG.md -+++ b/CHANGELOG.md -@@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - - Ignore VCPUOP_set_singleshot_timer's VCPU_SSHOTTMR_future flag. The only - known user doesn't use it properly, leading to in-guest breakage. - -+### Added -+ - On x86, support for enforcing system-wide operation in Data Operand -+ Independent Timing Mode. -+ - ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12 - - ### Changed -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index c4afd51a81..5ad24a70a9 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -770,6 +770,17 @@ Specify the size of the console debug trace buffer. By specifying `cpu:` - additionally a trace buffer of the specified size is allocated per cpu. - The debug trace feature is only enabled in debugging builds of Xen. - -+### dit (x86/Intel) -+> `= ` -+ -+> Default: `CONFIG_DIT_DEFAULT` -+ -+Specify whether Xen and guests should operate in Data Independent Timing -+mode (Intel calls this DOITM, Data Operand Independent Timing Mode). Note -+that enabling this option cannot guarantee anything beyond what underlying -+hardware guarantees (with, where available and known to Xen, respective -+tweaks applied). 
-+ - ### dma_bits - > `= ` - -diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig -index 2a5c3304e2..ab47cc23ac 100644 ---- a/xen/arch/x86/Kconfig -+++ b/xen/arch/x86/Kconfig -@@ -14,6 +14,7 @@ config X86 - select HAS_ALTERNATIVE - select HAS_COMPAT - select HAS_CPUFREQ -+ select HAS_DIT - select HAS_EHCI - select HAS_EX_TABLE - select HAS_FAST_MULTIPLY -diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c -index ffa6099307..54ea7fa831 100644 ---- a/xen/arch/x86/cpu/common.c -+++ b/xen/arch/x86/cpu/common.c -@@ -211,6 +211,28 @@ void ctxt_switch_levelling(const struct vcpu *next) - alternative_vcall(ctxt_switch_masking, next); - } - -+static void setup_doitm(void) -+{ -+ uint64_t msr; -+ -+ if ( !cpu_has_doitm ) -+ return; -+ -+ /* -+ * We don't currently enumerate DOITM to guests. As a conseqeuence, guest -+ * kernels will believe they're safe even when they are not. -+ * -+ * For now, set it unilaterally. This prevents otherwise-correct crypto -+ * code from becoming vulnerable to timing sidechannels. -+ */ -+ -+ rdmsrl(MSR_UARCH_MISC_CTRL, msr); -+ msr |= UARCH_CTRL_DOITM; -+ if ( !opt_dit ) -+ msr &= ~UARCH_CTRL_DOITM; -+ wrmsrl(MSR_UARCH_MISC_CTRL, msr); -+} -+ - bool_t opt_cpu_info; - boolean_param("cpuinfo", opt_cpu_info); - -@@ -596,6 +618,8 @@ void identify_cpu(struct cpuinfo_x86 *c) - - mtrr_bp_init(); - } -+ -+ setup_doitm(); - } - - /* leaf 0xb SMT level */ -diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h -index b818ef75c0..9ef7756593 100644 ---- a/xen/arch/x86/include/asm/cpufeature.h -+++ b/xen/arch/x86/include/asm/cpufeature.h -@@ -155,6 +155,7 @@ - #define cpu_has_if_pschange_mc_no boot_cpu_has(X86_FEATURE_IF_PSCHANGE_MC_NO) - #define cpu_has_tsx_ctrl boot_cpu_has(X86_FEATURE_TSX_CTRL) - #define cpu_has_taa_no boot_cpu_has(X86_FEATURE_TAA_NO) -+#define cpu_has_doitm boot_cpu_has(X86_FEATURE_DOITM) - #define cpu_has_fb_clear boot_cpu_has(X86_FEATURE_FB_CLEAR) - #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) - #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) -diff --git a/xen/common/Kconfig b/xen/common/Kconfig -index 855c843113..e7794cb7f6 100644 ---- a/xen/common/Kconfig -+++ b/xen/common/Kconfig -@@ -38,6 +38,9 @@ config HAS_COMPAT - config HAS_DEVICE_TREE - bool - -+config HAS_DIT # Data Independent Timing -+ bool -+ - config HAS_EX_TABLE - bool - -@@ -172,6 +175,21 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS - - endmenu - -+config DIT_DEFAULT -+ bool "Data Independent Timing default" -+ depends on HAS_DIT -+ help -+ Hardware often surfaces instructions the timing of which is dependent -+ on the data they process. Some of these instructions may be used in -+ timing sensitive environments, e.g. cryptography. When such -+ instructions exist, hardware may further surface a control allowing -+ to make the behavior of such instructions independent of the data -+ they act upon. Note the build time value can be overridden at runtime -+ using the "dit" command line option. -+ -+ NB: Intel calls the feature DOITM (Data Operand Independent Timing -+ Mode). 
-+
- config HYPFS
- 	bool "Hypervisor file system support"
- 	default y
-diff --git a/xen/common/kernel.c b/xen/common/kernel.c
-index 0e8abe0cf8..f64f7dab37 100644
---- a/xen/common/kernel.c
-+++ b/xen/common/kernel.c
-@@ -22,6 +22,11 @@
- 
- enum system_state system_state = SYS_STATE_early_boot;
- 
-+#ifdef CONFIG_HAS_DIT
-+bool __ro_after_init opt_dit = IS_ENABLED(CONFIG_DIT_DEFAULT);
-+boolean_param("dit", opt_dit);
-+#endif
-+
- xen_commandline_t saved_cmdline;
- static const char __initconst opt_builtin_cmdline[] = CONFIG_CMDLINE;
- 
-diff --git a/xen/include/xen/param.h b/xen/include/xen/param.h
-index 1b2c7db954..93c3fe7cb7 100644
---- a/xen/include/xen/param.h
-+++ b/xen/include/xen/param.h
-@@ -184,6 +184,8 @@ extern struct param_hypfs __paramhypfs_start[], __paramhypfs_end[];
-     string_param(_name, _var); \
-     string_runtime_only_param(_name, _var)
- 
-+extern bool opt_dit;
-+
- static inline void no_config_param(const char *cfg, const char *param,
-                                    const char *s, const char *e)
- {
--- 
-2.30.2
-
diff --git a/xen.spec b/xen.spec
index b0ecd08..ea2fca4 100644
--- a/xen.spec
+++ b/xen.spec
@@ -54,8 +54,8 @@
 Summary: Xen is a virtual machine monitor
 
 Name: xen
-Version: 4.17.2
-Release: 8%{?dist}
+Version: 4.17.4
+Release: 1%{?dist}
 License: GPLv2+ and LGPLv2+ and BSD
 URL: http://xen.org/
 Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz
@@ -112,60 +112,6 @@ Patch46: xen.efi.build.patch
 Patch47: xen.gcc13.fixes.patch
 Patch49: xen.python3.12.patch
 Patch50: xen.ocaml5.fixes.patch
-Patch51: xsa437.patch
-Patch52: xsa438-4.17.patch
-Patch53: xsa439-0001-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch
-Patch54: xsa439-0002-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch
-Patch55: xsa439-0003-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch
-Patch56: xsa439-0004-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch
-Patch57: xsa439-0005-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch
-Patch58: xsa439-0006-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch
-Patch59: xsa439-0007-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch
-Patch60: xsa439-0008-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch
-Patch61: xsa439-0009-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch
-Patch62: xsa439-0010-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch
-Patch63: xsa440-4.17.patch
-Patch64: xsa442-4.17.patch
-Patch65: xsa443-4.17-01.patch
-Patch66: xsa443-4.17-02.patch
-Patch67: xsa443-4.17-03.patch
-Patch68: xsa443-4.17-04.patch
-Patch69: xsa443-4.17-05.patch
-Patch70: xsa443-4.17-06.patch
-Patch71: xsa443-4.17-07.patch
-Patch72: xsa443-4.17-08.patch
-Patch73: xsa443-4.17-09.patch
-Patch74: xsa443-4.17-10.patch
-Patch75: xsa443-4.17-11.patch
-Patch76: xsa444-4.17-1.patch
-Patch77: xsa444-4.17-2.patch
-Patch78: xsa445-4.17.patch
-Patch79: xsa446.patch
-Patch80: xsa447.patch
-Patch81: xsa449.patch
-Patch82: xsa450.patch
-Patch83: xsa451-4.17.patch
-Patch84: xen.git-54dacb5c02cba4676879ed077765734326b78e39.patch
-Patch85: xsa452-4.17-1.patch
-Patch86: xsa452-4.17-2.patch
-Patch87: xsa452-4.17-3.patch
-Patch88: xsa452-4.17-4.patch
-Patch89: xsa452-4.17-5.patch
-Patch90: xsa452-4.17-6.patch
-Patch91: xsa452-4.17-7.patch
-Patch92: xen.git-91650010815f3da0834bc9781c4359350d1162a5.patch
-Patch93: xen.git-76ea2aab3652cc34e474de0905f0a9cd4df7d087.patch
-Patch94: xen.git-0ce25b46ab2fb53a1b58f7682ca14971453f4f2c.patch
-Patch95: xen.git-7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73.patch
-Patch96: xen.git-bb13e631432a6fbcc0391431fc27ac85dc438248.patch
-Patch97: xsa453-4.17-1.patch
-Patch98: xsa453-4.17-2.patch
-Patch99: xsa453-4.17-3.patch
-Patch100: xsa453-4.17-4.patch
-Patch101: xsa453-4.17-5.patch
-Patch102: xsa453-4.17-6.patch
-Patch103: xsa453-4.17-7.patch
-Patch104: xsa453-4.17-8.patch
 
 
 %if %build_qemutrad
@@ -382,60 +328,6 @@ manage Xen virtual machines.
 %if "%dist" != ".fc38"
 %patch 50 -p1
 %endif
-%patch 51 -p1
-%patch 52 -p1
-%patch 53 -p1
-%patch 54 -p1
-%patch 55 -p1
-%patch 56 -p1
-%patch 57 -p1
-%patch 58 -p1
-%patch 59 -p1
-%patch 60 -p1
-%patch 61 -p1
-%patch 62 -p1
-%patch 63 -p1
-%patch 64 -p1
-%patch 65 -p1
-%patch 66 -p1
-%patch 67 -p1
-%patch 68 -p1
-%patch 69 -p1
-%patch 70 -p1
-%patch 71 -p1
-%patch 72 -p1
-%patch 73 -p1
-%patch 74 -p1
-%patch 75 -p1
-%patch 76 -p1
-%patch 77 -p1
-%patch 78 -p1
-%patch 79 -p1
-%patch 80 -p1
-%patch 81 -p1
-%patch 82 -p1
-%patch 83 -p1
-%patch 84 -p1
-%patch 85 -p1
-%patch 86 -p1
-%patch 87 -p1
-%patch 88 -p1
-%patch 89 -p1
-%patch 90 -p1
-%patch 91 -p1
-%patch 92 -p1
-%patch 93 -p1
-%patch 94 -p1
-%patch 95 -p1
-%patch 96 -p1
-%patch 97 -p1
-%patch 98 -p1
-%patch 99 -p1
-%patch 100 -p1
-%patch 101 -p1
-%patch 102 -p1
-%patch 103 -p1
-%patch 104 -p1
 
 # qemu-xen-traditional patches
 pushd tools/qemu-xen-traditional
@@ -1043,6 +935,13 @@ fi
 %endif
 
 %changelog
+* Tue Apr 09 2024 Michael Young - 4.17.4-1
+- x86: Native Branch History Injection [XSA-456, CVE-2024-2201]
+- update to xen 4.17.4, remove patches now included upstream
+  rebase xen.gcc12.fixes.patch
+- x86 HVM hypercalls may trigger Xen bug check [XSA-454, CVE-2023-46842]
+- x86: Incorrect logic for BTC/SRSO mitigations [XSA-455, CVE-2024-31142]
+
 * Thu Mar 14 2024 Michael Young - 4.17.2-8
 - x86: Register File Data Sampling [XSA-452, CVE-2023-28746]
 - GhostRace: Speculative Race Conditions [XSA-453, CVE-2024-2193]
diff --git a/xsa437.patch b/xsa437.patch
deleted file mode 100644
index 18c9f8f..0000000
--- a/xsa437.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From 7fac5971340a13ca9458195305bcfe14df2e52d2 Mon Sep 17 00:00:00 2001
-From: Stefano Stabellini
-Date: Thu, 17 Aug 2023 13:41:35 +0100
-Subject: [PATCH] xen/arm: page: Handle cache flush of an element at the top of
- the address space
-
-The region that needs to be cleaned/invalidated may be at the top
-of the address space. This means that 'end' (i.e. 'p + size') will
-be 0 and therefore nothing will be cleaned/invalidated as the check
-in the loop will always be false.
-
-On Arm64, we only support up to 48-bit Virtual
-address space. So this is not a concern there. However, for 32-bit,
-the mapcache is using the last 2GB of the address space. Therefore
-we may not clean/invalidate properly some pages. This could lead
-to memory corruption or data leakage (the scrubbed value may
-still sit in the cache when the guest could read directly the memory
-and therefore read the old content).
-
-Rework invalidate_dcache_va_range(), clean_dcache_va_range(),
-clean_and_invalidate_dcache_va_range() to handle a cache flush
-with an element at the top of the address space.
-
-This is CVE-2023-34321 / XSA-437.
- -Reported-by: Julien Grall -Signed-off-by: Stefano Stabellini -Signed-off-by: Julien Grall -Acked-by: Bertrand Marquis - ---- - xen/arch/arm/include/asm/page.h | 33 ++++++++++++++++++++------------- - 1 file changed, 20 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h -index e7cd62190c7f..d7fe770a5e49 100644 ---- a/xen/arch/arm/include/asm/page.h -+++ b/xen/arch/arm/include/asm/page.h -@@ -160,26 +160,25 @@ static inline size_t read_dcache_line_bytes(void) - - static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - { -- const void *end = p + size; - size_t cacheline_mask = dcache_line_bytes - 1; - - dsb(sy); /* So the CPU issues all writes to the range */ - - if ( (uintptr_t)p & cacheline_mask ) - { -+ size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask); - p = (void *)((uintptr_t)p & ~cacheline_mask); - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); - p += dcache_line_bytes; - } -- if ( (uintptr_t)end & cacheline_mask ) -- { -- end = (void *)((uintptr_t)end & ~cacheline_mask); -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (end)); -- } - -- for ( ; p < end; p += dcache_line_bytes ) -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__invalidate_dcache_one(0) : : "r" (p)); - -+ if ( size > 0 ) -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ - dsb(sy); /* So we know the flushes happen before continuing */ - - return 0; -@@ -187,10 +186,14 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - - static inline int clean_dcache_va_range(const void *p, unsigned long size) - { -- const void *end = p + size; -+ size_t cacheline_mask = dcache_line_bytes - 1; -+ - dsb(sy); /* So the CPU issues all writes to the range */ -- p = (void *)((uintptr_t)p & ~(dcache_line_bytes - 1)); -- for ( ; p < end; p += dcache_line_bytes ) -+ size += (uintptr_t)p & cacheline_mask; -+ size = (size + cacheline_mask) & ~cacheline_mask; -+ p = (void *)((uintptr_t)p & ~cacheline_mask); -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__clean_dcache_one(0) : : "r" (p)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. */ -@@ -200,10 +203,14 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size) - static inline int clean_and_invalidate_dcache_va_range - (const void *p, unsigned long size) - { -- const void *end = p + size; -+ size_t cacheline_mask = dcache_line_bytes - 1; -+ - dsb(sy); /* So the CPU issues all writes to the range */ -- p = (void *)((uintptr_t)p & ~(dcache_line_bytes - 1)); -- for ( ; p < end; p += dcache_line_bytes ) -+ size += (uintptr_t)p & cacheline_mask; -+ size = (size + cacheline_mask) & ~cacheline_mask; -+ p = (void *)((uintptr_t)p & ~cacheline_mask); -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. 
*/ --- -2.40.1 - diff --git a/xsa438-4.17.patch b/xsa438-4.17.patch deleted file mode 100644 index 12d6ec7..0000000 --- a/xsa438-4.17.patch +++ /dev/null @@ -1,416 +0,0 @@ -From: Jan Beulich -Subject: x86/shadow: defer releasing of PV's top-level shadow reference - -sh_set_toplevel_shadow() re-pinning the top-level shadow we may be -running on is not enough (and at the same time unnecessary when the -shadow isn't what we're running on): That shadow becomes eligible for -blowing away (from e.g. shadow_prealloc()) immediately after the -paging lock was dropped. Yet it needs to remain valid until the actual -page table switch occurred. - -Propagate up the call chain the shadow entry that needs releasing -eventually, and carry out the release immediately after switching page -tables. Handle update_cr3() failures by switching to idle pagetables. -Note that various further uses of update_cr3() are HVM-only or only act -on paused vCPU-s, in which case sh_set_toplevel_shadow() will not defer -releasing of the reference. - -While changing the update_cr3() hook, also convert the "do_locking" -parameter to boolean. - -This is CVE-2023-34322 / XSA-438. - -Reported-by: Tim Deegan -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap - ---- a/xen/arch/x86/include/asm/mm.h -+++ b/xen/arch/x86/include/asm/mm.h -@@ -552,7 +552,7 @@ void audit_domains(void); - #endif - - void make_cr3(struct vcpu *v, mfn_t mfn); --void update_cr3(struct vcpu *v); -+pagetable_t update_cr3(struct vcpu *v); - int vcpu_destroy_pagetables(struct vcpu *); - void *do_page_walk(struct vcpu *v, unsigned long addr); - ---- a/xen/arch/x86/include/asm/paging.h -+++ b/xen/arch/x86/include/asm/paging.h -@@ -138,7 +138,7 @@ struct paging_mode { - paddr_t ga, uint32_t *pfec, - unsigned int *page_order); - #endif -- void (*update_cr3 )(struct vcpu *v, int do_locking, -+ pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, - bool noflush); - void (*update_paging_modes )(struct vcpu *v); - bool (*flush_tlb )(const unsigned long *vcpu_bitmap); -@@ -310,9 +310,9 @@ static inline unsigned long paging_ga_to - /* Update all the things that are derived from the guest's CR3. - * Called when the guest changes CR3; the caller can then use v->arch.cr3 - * as the value to load into the host CR3 to schedule this vcpu */ --static inline void paging_update_cr3(struct vcpu *v, bool noflush) -+static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) - { -- paging_get_hostmode(v)->update_cr3(v, 1, noflush); -+ return paging_get_hostmode(v)->update_cr3(v, 1, noflush); - } - - /* Update all the things that are derived from the guest's CR0/CR3/CR4. ---- a/xen/arch/x86/include/asm/shadow.h -+++ b/xen/arch/x86/include/asm/shadow.h -@@ -99,6 +99,9 @@ int shadow_set_allocation(struct domain - - int shadow_get_allocation_bytes(struct domain *d, uint64_t *size); - -+/* Helper to invoke for deferred releasing of a top-level shadow's reference. 
*/ -+void shadow_put_top_level(struct domain *d, pagetable_t old); -+ - #else /* !CONFIG_SHADOW_PAGING */ - - #define shadow_vcpu_teardown(v) ASSERT(is_pv_vcpu(v)) -@@ -121,6 +124,11 @@ static inline void shadow_prepare_page_t - - static inline void shadow_blow_tables_per_domain(struct domain *d) {} - -+static inline void shadow_put_top_level(struct domain *d, pagetable_t old) -+{ -+ ASSERT_UNREACHABLE(); -+} -+ - static inline int shadow_domctl(struct domain *d, - struct xen_domctl_shadow_op *sc, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -739,11 +739,13 @@ static bool cf_check hap_invlpg(struct v - return 1; - } - --static void cf_check hap_update_cr3( -- struct vcpu *v, int do_locking, bool noflush) -+static pagetable_t cf_check hap_update_cr3( -+ struct vcpu *v, bool do_locking, bool noflush) - { - v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; - hvm_update_guest_cr3(v, noflush); -+ -+ return pagetable_null(); - } - - static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap) ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2590,13 +2590,13 @@ void cf_check shadow_update_paging_modes - } - - /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ --void sh_set_toplevel_shadow(struct vcpu *v, -- unsigned int slot, -- mfn_t gmfn, -- unsigned int root_type, -- mfn_t (*make_shadow)(struct vcpu *v, -- mfn_t gmfn, -- uint32_t shadow_type)) -+pagetable_t sh_set_toplevel_shadow(struct vcpu *v, -+ unsigned int slot, -+ mfn_t gmfn, -+ unsigned int root_type, -+ mfn_t (*make_shadow)(struct vcpu *v, -+ mfn_t gmfn, -+ uint32_t shadow_type)) - { - mfn_t smfn; - pagetable_t old_entry, new_entry; -@@ -2653,20 +2653,37 @@ void sh_set_toplevel_shadow(struct vcpu - mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry))); - v->arch.paging.shadow.shadow_table[slot] = new_entry; - -- /* Decrement the refcount of the old contents of this slot */ -- if ( !pagetable_is_null(old_entry) ) -+ /* -+ * Decrement the refcount of the old contents of this slot, unless -+ * we're still running on that shadow - in that case it'll need holding -+ * on to until the actual page table switch did occur. -+ */ -+ if ( !pagetable_is_null(old_entry) && (v != current || !is_pv_domain(d)) ) - { -- mfn_t old_smfn = pagetable_get_mfn(old_entry); -- /* Need to repin the old toplevel shadow if it's been unpinned -- * by shadow_prealloc(): in PV mode we're still running on this -- * shadow and it's not safe to free it yet. */ -- if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(d, old_smfn) ) -- { -- printk(XENLOG_G_ERR "can't re-pin %"PRI_mfn"\n", mfn_x(old_smfn)); -- domain_crash(d); -- } -- sh_put_ref(d, old_smfn, 0); -+ sh_put_ref(d, pagetable_get_mfn(old_entry), 0); -+ old_entry = pagetable_null(); - } -+ -+ /* -+ * 2- and 3-level shadow mode is used for HVM only. Therefore we never run -+ * on such a shadow, so only call sites requesting an L4 shadow need to pay -+ * attention to the returned value. -+ */ -+ ASSERT(pagetable_is_null(old_entry) || root_type == SH_type_l4_64_shadow); -+ -+ return old_entry; -+} -+ -+/* -+ * Helper invoked when releasing of a top-level shadow's reference was -+ * deferred in sh_set_toplevel_shadow() above. 
-+ */ -+void shadow_put_top_level(struct domain *d, pagetable_t old_entry) -+{ -+ ASSERT(!pagetable_is_null(old_entry)); -+ paging_lock(d); -+ sh_put_ref(d, pagetable_get_mfn(old_entry), 0); -+ paging_unlock(d); - } - - /**************************************************************************/ ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -3224,7 +3224,8 @@ static void cf_check sh_detach_old_table - } - } - --static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) -+static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, -+ bool noflush) - /* Updates vcpu->arch.cr3 after the guest has changed CR3. - * Paravirtual guests should set v->arch.guest_table (and guest_table_user, - * if appropriate). -@@ -3238,6 +3239,7 @@ static void cf_check sh_update_cr3(struc - { - struct domain *d = v->domain; - mfn_t gmfn; -+ pagetable_t old_entry = pagetable_null(); - #if GUEST_PAGING_LEVELS == 3 - const guest_l3e_t *gl3e; - unsigned int i, guest_idx; -@@ -3247,7 +3249,7 @@ static void cf_check sh_update_cr3(struc - if ( !is_hvm_domain(d) && !v->is_initialised ) - { - ASSERT(v->arch.cr3 == 0); -- return; -+ return old_entry; - } - - if ( do_locking ) paging_lock(v->domain); -@@ -3320,11 +3322,12 @@ static void cf_check sh_update_cr3(struc - #if GUEST_PAGING_LEVELS == 4 - if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); -- sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, -+ sh_make_shadow); - if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) - { - ASSERT(d->is_dying || d->is_shutting_down); -- return; -+ return old_entry; - } - if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) - { -@@ -3368,24 +3371,30 @@ static void cf_check sh_update_cr3(struc - gl2gfn = guest_l3e_get_gfn(gl3e[i]); - gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); - if ( p2m_is_ram(p2mt) ) -- sh_set_toplevel_shadow(v, i, gl2mfn, SH_type_l2_shadow, -- sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, gl2mfn, -+ SH_type_l2_shadow, -+ sh_make_shadow); - else -- sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -- sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -+ sh_make_shadow); - } - else -- sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -+ sh_make_shadow); -+ -+ ASSERT(pagetable_is_null(old_entry)); - } - } - #elif GUEST_PAGING_LEVELS == 2 - if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); -- sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, -+ sh_make_shadow); -+ ASSERT(pagetable_is_null(old_entry)); - if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) - { - ASSERT(d->is_dying || d->is_shutting_down); -- return; -+ return old_entry; - } - #else - #error This should never happen -@@ -3473,6 +3482,8 @@ static void cf_check sh_update_cr3(struc - - /* Release the lock, if we took it (otherwise it's the caller's problem) */ - if ( do_locking ) paging_unlock(v->domain); -+ -+ return old_entry; - } - - ---- a/xen/arch/x86/mm/shadow/none.c -+++ b/xen/arch/x86/mm/shadow/none.c -@@ -52,9 +52,11 @@ static unsigned long cf_check _gva_to_gf - } - #endif - --static void cf_check _update_cr3(struct vcpu *v, int do_locking, bool 
noflush) -+static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, -+ bool noflush) - { - ASSERT_UNREACHABLE(); -+ return pagetable_null(); - } - - static void cf_check _update_paging_modes(struct vcpu *v) ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -391,13 +391,13 @@ mfn_t shadow_alloc(struct domain *d, - void shadow_free(struct domain *d, mfn_t smfn); - - /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ --void sh_set_toplevel_shadow(struct vcpu *v, -- unsigned int slot, -- mfn_t gmfn, -- unsigned int root_type, -- mfn_t (*make_shadow)(struct vcpu *v, -- mfn_t gmfn, -- uint32_t shadow_type)); -+pagetable_t sh_set_toplevel_shadow(struct vcpu *v, -+ unsigned int slot, -+ mfn_t gmfn, -+ unsigned int root_type, -+ mfn_t (*make_shadow)(struct vcpu *v, -+ mfn_t gmfn, -+ uint32_t shadow_type)); - - /* Update the shadows in response to a pagetable write from Xen */ - int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size); ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -567,15 +567,12 @@ void write_ptbase(struct vcpu *v) - * - * Update ref counts to shadow tables appropriately. - */ --void update_cr3(struct vcpu *v) -+pagetable_t update_cr3(struct vcpu *v) - { - mfn_t cr3_mfn; - - if ( paging_mode_enabled(v->domain) ) -- { -- paging_update_cr3(v, false); -- return; -- } -+ return paging_update_cr3(v, false); - - if ( !(v->arch.flags & TF_kernel_mode) ) - cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user); -@@ -583,6 +580,8 @@ void update_cr3(struct vcpu *v) - cr3_mfn = pagetable_get_mfn(v->arch.guest_table); - - make_cr3(v, cr3_mfn); -+ -+ return pagetable_null(); - } - - static inline void set_tlbflush_timestamp(struct page_info *page) -@@ -3285,6 +3284,7 @@ int new_guest_cr3(mfn_t mfn) - struct domain *d = curr->domain; - int rc; - mfn_t old_base_mfn; -+ pagetable_t old_shadow; - - if ( is_pv_32bit_domain(d) ) - { -@@ -3352,9 +3352,22 @@ int new_guest_cr3(mfn_t mfn) - if ( !VM_ASSIST(d, m2p_strict) ) - fill_ro_mpt(mfn); - curr->arch.guest_table = pagetable_from_mfn(mfn); -- update_cr3(curr); -+ old_shadow = update_cr3(curr); -+ -+ /* -+ * In shadow mode update_cr3() can fail, in which case here we're still -+ * running on the prior top-level shadow (which we're about to release). -+ * Switch to the idle page tables in such an event; the guest will have -+ * been crashed already. -+ */ -+ if ( likely(!mfn_eq(pagetable_get_mfn(old_shadow), -+ maddr_to_mfn(curr->arch.cr3 & ~X86_CR3_NOFLUSH))) ) -+ write_ptbase(curr); -+ else -+ write_ptbase(idle_vcpu[curr->processor]); - -- write_ptbase(curr); -+ if ( !pagetable_is_null(old_shadow) ) -+ shadow_put_top_level(d, old_shadow); - - if ( likely(mfn_x(old_base_mfn) != 0) ) - { ---- a/xen/arch/x86/pv/domain.c -+++ b/xen/arch/x86/pv/domain.c -@@ -424,10 +424,13 @@ bool __init xpti_pcid_enabled(void) - - static void _toggle_guest_pt(struct vcpu *v) - { -+ bool guest_update; -+ pagetable_t old_shadow; - unsigned long cr3; - - v->arch.flags ^= TF_kernel_mode; -- update_cr3(v); -+ guest_update = v->arch.flags & TF_kernel_mode; -+ old_shadow = update_cr3(v); - - /* - * Don't flush user global mappings from the TLB. Don't tick TLB clock. -@@ -436,13 +439,31 @@ static void _toggle_guest_pt(struct vcpu - * TLB flush (for just the incoming PCID), as the top level page table may - * have changed behind our backs. To be on the safe side, suppress the - * no-flush unconditionally in this case. 
-+ * -+ * Furthermore in shadow mode update_cr3() can fail, in which case here -+ * we're still running on the prior top-level shadow (which we're about -+ * to release). Switch to the idle page tables in such an event; the -+ * guest will have been crashed already. - */ - cr3 = v->arch.cr3; - if ( shadow_mode_enabled(v->domain) ) -+ { - cr3 &= ~X86_CR3_NOFLUSH; -+ -+ if ( unlikely(mfn_eq(pagetable_get_mfn(old_shadow), -+ maddr_to_mfn(cr3))) ) -+ { -+ cr3 = idle_vcpu[v->processor]->arch.cr3; -+ /* Also suppress runstate/time area updates below. */ -+ guest_update = false; -+ } -+ } - write_cr3(cr3); - -- if ( !(v->arch.flags & TF_kernel_mode) ) -+ if ( !pagetable_is_null(old_shadow) ) -+ shadow_put_top_level(v->domain, old_shadow); -+ -+ if ( !guest_update ) - return; - - if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) ) diff --git a/xsa439-0001-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch b/xsa439-0001-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch deleted file mode 100644 index 96e56ec..0000000 --- a/xsa439-0001-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch +++ /dev/null @@ -1,49 +0,0 @@ -From d2d2dcae879c6cc05227c9620f0a772f35fe6886 Mon Sep 17 00:00:00 2001 -Message-ID: -From: Jan Beulich -Date: Wed, 23 Aug 2023 09:26:36 +0200 -Subject: [XEN PATCH 01/10] x86/AMD: extend Zenbleed check to models "good" - ucode isn't known for - -Reportedly the AMD Custom APU 0405 found on SteamDeck, models 0x90 and -0x91, (quoting the respective Linux commit) is similarly affected. Put -another instance of our Zen1 vs Zen2 distinction checks in -amd_check_zenbleed(), forcing use of the chickenbit irrespective of -ucode version (building upon real hardware never surfacing a version of -0xffffffff). - -Signed-off-by: Jan Beulich -Reviewed-by: Andrew Cooper -(cherry picked from commit 145a69c0944ac70cfcf9d247c85dee9e99d9d302) ---- - xen/arch/x86/cpu/amd.c | 13 ++++++++++--- - 1 file changed, 10 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 3ea214fc2e..1bb3044be1 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -909,10 +909,17 @@ void amd_check_zenbleed(void) - case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; - default: - /* -- * With the Fam17h check above, parts getting here are Zen1. -- * They're not affected. -+ * With the Fam17h check above, most parts getting here are -+ * Zen1. They're not affected. Assume Zen2 ones making it -+ * here are affected regardless of microcode version. -+ * -+ * Zen1 vs Zen2 isn't a simple model number comparison, so use -+ * STIBP as a heuristic to distinguish. 
- */ -- return; -+ if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ return; -+ good_rev = ~0U; -+ break; - } - - rdmsrl(MSR_AMD64_DE_CFG, val); --- -2.41.0 - diff --git a/xsa439-0002-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch b/xsa439-0002-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch deleted file mode 100644 index 8b8e30a..0000000 --- a/xsa439-0002-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch +++ /dev/null @@ -1,77 +0,0 @@ -From dc28aba565f226f9bec24cfde993e78478acfb4e Mon Sep 17 00:00:00 2001 -Message-ID: -In-Reply-To: -References: -From: Andrew Cooper -Date: Tue, 12 Sep 2023 15:06:49 +0100 -Subject: [XEN PATCH 02/10] x86/spec-ctrl: Fix confusion between - SPEC_CTRL_EXIT_TO_XEN{,_IST} - -c/s 3fffaf9c13e9 ("x86/entry: Avoid using alternatives in NMI/#MC paths") -dropped the only user, leaving behind the (incorrect) implication that Xen had -split exit paths. - -Delete the unused SPEC_CTRL_EXIT_TO_XEN and rename SPEC_CTRL_EXIT_TO_XEN_IST -to SPEC_CTRL_EXIT_TO_XEN for consistency. - -No functional change. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 1c18d73774533a55ba9d1cbee8bdace03efdb5e7) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 10 ++-------- - xen/arch/x86/x86_64/entry.S | 2 +- - 2 files changed, 3 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index f23bb105c5..e8fd01243c 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -79,7 +79,6 @@ - * - SPEC_CTRL_ENTRY_FROM_PV - * - SPEC_CTRL_ENTRY_FROM_INTR - * - SPEC_CTRL_ENTRY_FROM_INTR_IST -- * - SPEC_CTRL_EXIT_TO_XEN_IST - * - SPEC_CTRL_EXIT_TO_XEN - * - SPEC_CTRL_EXIT_TO_PV - * -@@ -268,11 +267,6 @@ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV - --/* Use when exiting to Xen context. */ --#define SPEC_CTRL_EXIT_TO_XEN \ -- ALTERNATIVE "", \ -- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_SC_MSR_PV -- - /* Use when exiting to PV guest context. */ - #define SPEC_CTRL_EXIT_TO_PV \ - ALTERNATIVE "", \ -@@ -339,8 +333,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - UNLIKELY_END(\@_serialise) - .endm - --/* Use when exiting to Xen in IST context. */ --.macro SPEC_CTRL_EXIT_TO_XEN_IST -+/* Use when exiting to Xen context. */ -+.macro SPEC_CTRL_EXIT_TO_XEN - /* - * Requires %rbx=stack_end - * Clobbers %rax, %rcx, %rdx -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 7675a59ff0..b45a09823a 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -673,7 +673,7 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ -- SPEC_CTRL_EXIT_TO_XEN_IST /* Req: %rbx=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ - - RESTORE_ALL adj=8 - iretq --- -2.41.0 - diff --git a/xsa439-0003-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch b/xsa439-0003-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch deleted file mode 100644 index 547032e..0000000 --- a/xsa439-0003-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 84690fb82c4f4aecb72a6789d8994efa74841e09 Mon Sep 17 00:00:00 2001 -Message-ID: <84690fb82c4f4aecb72a6789d8994efa74841e09.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Tue, 12 Sep 2023 17:03:16 +0100 -Subject: [XEN PATCH 03/10] x86/spec-ctrl: Fold DO_SPEC_CTRL_EXIT_TO_XEN into - it's single user - -With the SPEC_CTRL_EXIT_TO_XEN{,_IST} confusion fixed, it's now obvious that -there's only a single EXIT_TO_XEN path. Fold DO_SPEC_CTRL_EXIT_TO_XEN into -SPEC_CTRL_EXIT_TO_XEN to simplify further fixes. - -When merging labels, switch the name to .L\@_skip_sc_msr as "skip" on its own -is going to be too generic shortly. - -No functional change. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 694bb0f280fd08a4377e36e32b84b5062def4de2) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 40 ++++++++++-------------- - 1 file changed, 16 insertions(+), 24 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index e8fd01243c..d5f65d80ea 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -211,27 +211,6 @@ - wrmsr - .endm - --.macro DO_SPEC_CTRL_EXIT_TO_XEN --/* -- * Requires %rbx=stack_end -- * Clobbers %rax, %rcx, %rdx -- * -- * When returning to Xen context, look to see whether SPEC_CTRL shadowing is -- * in effect, and reload the shadow value. This covers race conditions which -- * exist with an NMI/MCE/etc hitting late in the return-to-guest path. -- */ -- xor %edx, %edx -- -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -- jz .L\@_skip -- -- mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -- mov $MSR_SPEC_CTRL, %ecx -- wrmsr -- --.L\@_skip: --.endm -- - .macro DO_SPEC_CTRL_EXIT_TO_GUEST - /* - * Requires %eax=spec_ctrl, %rsp=regs/cpuinfo -@@ -340,11 +319,24 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * Clobbers %rax, %rcx, %rdx - */ - testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -- jz .L\@_skip -+ jz .L\@_skip_sc_msr - -- DO_SPEC_CTRL_EXIT_TO_XEN -+ /* -+ * When returning to Xen context, look to see whether SPEC_CTRL shadowing -+ * is in effect, and reload the shadow value. This covers race conditions -+ * which exist with an NMI/MCE/etc hitting late in the return-to-guest -+ * path. 
-+ */ -+ xor %edx, %edx - --.L\@_skip: -+ testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ jz .L\@_skip_sc_msr -+ -+ mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -+ mov $MSR_SPEC_CTRL, %ecx -+ wrmsr -+ -+.L\@_skip_sc_msr: - .endm - - #endif /* __ASSEMBLY__ */ --- -2.41.0 - diff --git a/xsa439-0004-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch b/xsa439-0004-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch deleted file mode 100644 index 3350750..0000000 --- a/xsa439-0004-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch +++ /dev/null @@ -1,86 +0,0 @@ -From 3952c73bdbd05f0e666986fce633a591237b3c88 Mon Sep 17 00:00:00 2001 -Message-ID: <3952c73bdbd05f0e666986fce633a591237b3c88.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Fri, 1 Sep 2023 11:38:44 +0100 -Subject: [XEN PATCH 04/10] x86/spec-ctrl: Turn the remaining - SPEC_CTRL_{ENTRY,EXIT}_* into asm macros - -These have grown more complex over time, with some already having been -converted. - -Provide full Requires/Clobbers comments, otherwise missing at this level of -indirection. - -No functional change. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 7125429aafb9e3c9c88fc93001fc2300e0ac2cc8) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 37 ++++++++++++++++++------ - 1 file changed, 28 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index d5f65d80ea..c6d5f2ad01 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -231,26 +231,45 @@ - .endm - - /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ --#define SPEC_CTRL_ENTRY_FROM_PV \ -+.macro SPEC_CTRL_ENTRY_FROM_PV -+/* -+ * Requires %rsp=regs/cpuinfo, %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ -- X86_FEATURE_IBPB_ENTRY_PV; \ -- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ -+ X86_FEATURE_IBPB_ENTRY_PV -+ -+ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV -+ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ - X86_FEATURE_SC_MSR_PV -+.endm - - /* Use in interrupt/exception context. May interrupt Xen or PV context. */ --#define SPEC_CTRL_ENTRY_FROM_INTR \ -+.macro SPEC_CTRL_ENTRY_FROM_INTR -+/* -+ * Requires %rsp=regs, %r14=stack_end, %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ -- X86_FEATURE_IBPB_ENTRY_PV; \ -- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ -+ X86_FEATURE_IBPB_ENTRY_PV -+ -+ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV -+ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV -+.endm - - /* Use when exiting to PV guest context. */ --#define SPEC_CTRL_EXIT_TO_PV \ -- ALTERNATIVE "", \ -- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ -+.macro SPEC_CTRL_EXIT_TO_PV -+/* -+ * Requires %rax=spec_ctrl, %rsp=regs/info -+ * Clobbers %rcx, %rdx -+ */ -+ ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV -+ - DO_SPEC_CTRL_COND_VERW -+.endm - - /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. 
--- -2.41.0 - diff --git a/xsa439-0005-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch b/xsa439-0005-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch deleted file mode 100644 index dda088a..0000000 --- a/xsa439-0005-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch +++ /dev/null @@ -1,109 +0,0 @@ -From ba023e93d0b1e60b80251bf080bab694efb9f8e3 Mon Sep 17 00:00:00 2001 -Message-ID: -In-Reply-To: -References: -From: Andrew Cooper -Date: Wed, 30 Aug 2023 20:11:50 +0100 -Subject: [XEN PATCH 05/10] x86/spec-ctrl: Improve all SPEC_CTRL_{ENTER,EXIT}_* - comments - -... to better explain how they're used. - -Doing so highlights that SPEC_CTRL_EXIT_TO_XEN is missing a VERW flush for the -corner case when e.g. an NMI hits late in an exit-to-guest path. - -Leave a TODO, which will be addressed in subsequent patches which arrange for -VERW flushing to be safe within SPEC_CTRL_EXIT_TO_XEN. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 45f00557350dc7d0756551069803fc49c29184ca) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++++++---- - 1 file changed, 31 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index c6d5f2ad01..97c4db31cd 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -230,7 +230,10 @@ - wrmsr - .endm - --/* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ -+/* -+ * Used after an entry from PV context: SYSCALL, SYSENTER, INT, -+ * etc. There is always a guest speculation state in context. -+ */ - .macro SPEC_CTRL_ENTRY_FROM_PV - /* - * Requires %rsp=regs/cpuinfo, %rdx=0 -@@ -245,7 +248,11 @@ - X86_FEATURE_SC_MSR_PV - .endm - --/* Use in interrupt/exception context. May interrupt Xen or PV context. */ -+/* -+ * Used after an exception or maskable interrupt, hitting Xen or PV context. -+ * There will either be a guest speculation context, or (barring fatal -+ * exceptions) a well-formed Xen speculation context. -+ */ - .macro SPEC_CTRL_ENTRY_FROM_INTR - /* - * Requires %rsp=regs, %r14=stack_end, %rdx=0 -@@ -260,7 +267,10 @@ - X86_FEATURE_SC_MSR_PV - .endm - --/* Use when exiting to PV guest context. */ -+/* -+ * Used when exiting from any entry context, back to PV context. This -+ * includes from an IST entry which moved onto the primary stack. -+ */ - .macro SPEC_CTRL_EXIT_TO_PV - /* - * Requires %rax=spec_ctrl, %rsp=regs/info -@@ -272,7 +282,13 @@ - .endm - - /* -- * Use in IST interrupt/exception context. May interrupt Xen or PV context. -+ * Used after an IST entry hitting Xen or PV context. Special care is needed, -+ * because when hitting Xen context, there may not be a well-formed -+ * speculation context. (i.e. it can hit in the middle of -+ * SPEC_CTRL_{ENTRY,EXIT}_* regions.) -+ * -+ * An IST entry which hits PV context moves onto the primary stack and leaves -+ * via SPEC_CTRL_EXIT_TO_PV, *not* SPEC_CTRL_EXIT_TO_XEN. - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* -@@ -331,7 +347,14 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - UNLIKELY_END(\@_serialise) - .endm - --/* Use when exiting to Xen context. */ -+/* -+ * Use when exiting from any entry context, back to Xen context. This -+ * includes returning to other SPEC_CTRL_{ENTRY,EXIT}_* regions with an -+ * incomplete speculation context. -+ * -+ * Because we might have interrupted Xen beyond SPEC_CTRL_EXIT_TO_$GUEST, we -+ * need to treat this as if it were an EXIT_TO_$GUEST case too. 
-+ */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* - * Requires %rbx=stack_end -@@ -356,6 +379,9 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - wrmsr - - .L\@_skip_sc_msr: -+ -+ /* TODO VERW */ -+ - .endm - - #endif /* __ASSEMBLY__ */ --- -2.41.0 - diff --git a/xsa439-0006-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch b/xsa439-0006-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch deleted file mode 100644 index e44998e..0000000 --- a/xsa439-0006-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 5f7efd47c8273fde972637d0360851802f76eca9 Mon Sep 17 00:00:00 2001 -Message-ID: <5f7efd47c8273fde972637d0360851802f76eca9.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Wed, 13 Sep 2023 13:48:16 +0100 -Subject: [XEN PATCH 06/10] x86/entry: Adjust restore_all_xen to hold stack_end - in %r14 - -All other SPEC_CTRL_{ENTRY,EXIT}_* helpers hold stack_end in %r14. Adjust it -for consistency. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 7aa28849a1155d856e214e9a80a7e65fffdc3e58) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 8 ++++---- - xen/arch/x86/x86_64/entry.S | 8 ++++---- - 2 files changed, 8 insertions(+), 8 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 97c4db31cd..66c706496f 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -357,10 +357,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %rbx=stack_end -+ * Requires %r14=stack_end - * Clobbers %rax, %rcx, %rdx - */ -- testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - jz .L\@_skip_sc_msr - - /* -@@ -371,10 +371,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - xor %edx, %edx - -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - jz .L\@_skip_sc_msr - -- mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -+ mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax - mov $MSR_SPEC_CTRL, %ecx - wrmsr - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index b45a09823a..92279a225d 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -665,15 +665,15 @@ restore_all_xen: - * Check whether we need to switch to the per-CPU page tables, in - * case we return to late PV exit code (from an NMI or #MC). - */ -- GET_STACK_END(bx) -- cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) -+ GET_STACK_END(14) -+ cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) - UNLIKELY_START(ne, exit_cr3) -- mov STACK_CPUINFO_FIELD(pv_cr3)(%rbx), %rax -+ mov STACK_CPUINFO_FIELD(pv_cr3)(%r14), %rax - mov %rax, %cr3 - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ - - RESTORE_ALL adj=8 - iretq --- -2.41.0 - diff --git a/xsa439-0007-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch b/xsa439-0007-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch deleted file mode 100644 index 2e36bcc..0000000 --- a/xsa439-0007-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch +++ /dev/null @@ -1,112 +0,0 @@ -From e4a71bc0da0baf7464bb0d8e33053f330e5ea366 Mon Sep 17 00:00:00 2001 -Message-ID: -In-Reply-To: -References: -From: Andrew Cooper -Date: Wed, 13 Sep 2023 12:20:12 +0100 -Subject: [XEN PATCH 07/10] x86/entry: Track the IST-ness of an entry for the - exit paths - -Use %r12 to hold an ist_exit boolean. This register is zero elsewhere in the -entry/exit asm, so it only needs setting in the IST path. - -As this is subtle and fragile, add check_ist_exit() to be used in debugging -builds to cross-check that the ist_exit boolean matches the entry vector. - -Write check_ist_exit() it in C, because it's debug only and the logic more -complicated than I care to maintain in asm. - -For now, we only need to use this signal in the exit-to-Xen path, but some -exit-to-guest paths happen in IST context too. Check the correctness in all -exit paths to avoid the logic bit-rotting. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 21bdc25b05a0f8ab6bc73520a9ca01327360732c) - -x86/entry: Partially revert IST-exit checks - -The patch adding check_ist_exit() didn't account for the fact that -reset_stack_and_jump() is not an ABI-preserving boundary. The IST-ness in -%r12 doesn't survive into the next context, and is a stale value C. - -This shows up in Gitlab CI for the Clang build: - - https://gitlab.com/xen-project/people/andyhhp/xen/-/jobs/5112783827 - -and in OSSTest for GCC 8: - - http://logs.test-lab.xenproject.org/osstest/logs/183045/test-amd64-amd64-xl-qemuu-debianhvm-amd64/serial-pinot0.log - -There's no straightforward way to reconstruct the IST-exit-ness on the -exit-to-guest path after a context switch. For now, we only need IST-exit on -the return-to-Xen path. - -Fixes: 21bdc25b05a0 ("x86/entry: Track the IST-ness of an entry for the exit paths") -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 9b57c800b79b96769ea3dcd6468578fa664d19f9) ---- - xen/arch/x86/traps.c | 13 +++++++++++++ - xen/arch/x86/x86_64/entry.S | 13 ++++++++++++- - 2 files changed, 25 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index d12004b1c6..e65cc60041 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -2315,6 +2315,19 @@ void asm_domain_crash_synchronous(unsigned long addr) - do_softirq(); - } - -+#ifdef CONFIG_DEBUG -+void check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit) -+{ -+ const unsigned int ist_mask = -+ (1U << X86_EXC_NMI) | (1U << X86_EXC_DB) | -+ (1U << X86_EXC_DF) | (1U << X86_EXC_MC); -+ uint8_t ev = regs->entry_vector; -+ bool is_ist = (ev < TRAP_nr) && ((1U << ev) & ist_mask); -+ -+ ASSERT(is_ist == ist_exit); -+} -+#endif -+ - /* - * Local variables: - * mode: C -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 92279a225d..4cebc4fbe3 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -659,8 +659,15 @@ ENTRY(early_page_fault) - .section .text.entry, "ax", @progbits - - ALIGN --/* No special register assumptions. 
*/ -+/* %r12=ist_exit */ - restore_all_xen: -+ -+#ifdef CONFIG_DEBUG -+ mov %rsp, %rdi -+ mov %r12, %rsi -+ call check_ist_exit -+#endif -+ - /* - * Check whether we need to switch to the per-CPU page tables, in - * case we return to late PV exit code (from an NMI or #MC). -@@ -1091,6 +1098,10 @@ handle_ist_exception: - .L_ist_dispatch_done: - mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) -+ -+ /* This is an IST exit */ -+ mov $1, %r12d -+ - cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) - jne ret_from_intr - --- -2.41.0 - diff --git a/xsa439-0008-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch b/xsa439-0008-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch deleted file mode 100644 index 6e00ca6..0000000 --- a/xsa439-0008-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 2e2c3efcfc9f183674a8de6ed954ffbe7188b70d Mon Sep 17 00:00:00 2001 -Message-ID: <2e2c3efcfc9f183674a8de6ed954ffbe7188b70d.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Wed, 13 Sep 2023 13:53:33 +0100 -Subject: [XEN PATCH 08/10] x86/spec-ctrl: Issue VERW during IST exit to Xen - -There is a corner case where e.g. an NMI hitting an exit-to-guest path after -SPEC_CTRL_EXIT_TO_* would have run the entire NMI handler *after* the VERW -flush to scrub potentially sensitive data from uarch buffers. - -In order to compensate, issue VERW when exiting to Xen from an IST entry. - -SPEC_CTRL_EXIT_TO_XEN already has two reads of spec_ctrl_flags off the stack, -and we're about to add a third. Load the field into %ebx, and list the -register as clobbered. - -%r12 has been arranged to be the ist_exit signal, so add this as an input -dependency and use it to identify when to issue a VERW. 
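In C terms, the logic this adds to SPEC_CTRL_EXIT_TO_XEN is roughly the
following. This is an illustrative sketch only, not the actual
implementation (which is assembly in spec_ctrl_asm.h); 'scf' and 'verw_sel'
stand in for the spec_ctrl_flags and verw_sel fields held near the top of
the stack:

    /* Sketch only: VERW on IST exit, gated on the %r12 ist_exit signal. */
    if ( ist_exit && (scf & SCF_verw) )
        asm volatile ( "verw %[sel]" :: [sel] "m" (verw_sel) );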
- -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 3ee6066bcd737756b0990d417d94eddc0b0d2585) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 20 +++++++++++++++----- - xen/arch/x86/x86_64/entry.S | 2 +- - 2 files changed, 16 insertions(+), 6 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 66c706496f..28a75796e6 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -357,10 +357,12 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %r14=stack_end -- * Clobbers %rax, %rcx, %rdx -+ * Requires %r12=ist_exit, %r14=stack_end -+ * Clobbers %rax, %rbx, %rcx, %rdx - */ -- testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx -+ -+ testb $SCF_ist_sc_msr, %bl - jz .L\@_skip_sc_msr - - /* -@@ -371,7 +373,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - xor %edx, %edx - -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ testb $SCF_use_shadow, %bl - jz .L\@_skip_sc_msr - - mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax -@@ -380,8 +382,16 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - - .L\@_skip_sc_msr: - -- /* TODO VERW */ -+ test %r12, %r12 -+ jz .L\@_skip_ist_exit -+ -+ /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ -+ testb $SCF_verw, %bl -+ jz .L\@_skip_verw -+ verw STACK_CPUINFO_FIELD(verw_sel)(%r14) -+.L\@_skip_verw: - -+.L\@_skip_ist_exit: - .endm - - #endif /* __ASSEMBLY__ */ -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 4cebc4fbe3..c12e011b4d 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -680,7 +680,7 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ - - RESTORE_ALL adj=8 - iretq --- -2.41.0 - diff --git a/xsa439-0009-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch b/xsa439-0009-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch deleted file mode 100644 index 5f063b1..0000000 --- a/xsa439-0009-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 19ee1e1faa32b79274b3484cb1170a5970f1e602 Mon Sep 17 00:00:00 2001 -Message-ID: <19ee1e1faa32b79274b3484cb1170a5970f1e602.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Fri, 15 Sep 2023 12:13:51 +0100 -Subject: [XEN PATCH 09/10] x86/amd: Introduce is_zen{1,2}_uarch() predicates - -We already have 3 cases using STIBP as a Zen1/2 heuristic, and are about to -introduce a 4th. Wrap the heuristic into a pair of predicates rather than -opencoding it, and the explanation of the heuristic, at each usage site. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit de1d265001397f308c5c3c5d3ffc30e7ef8c0705) ---- - xen/arch/x86/cpu/amd.c | 18 ++++-------------- - xen/arch/x86/include/asm/amd.h | 11 +++++++++++ - 2 files changed, 15 insertions(+), 14 deletions(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 1bb3044be1..e94ba5a0e0 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -855,15 +855,13 @@ void amd_set_legacy_ssbd(bool enable) - * non-branch instructions to be ignored. 
It is to be set unilaterally in - * newer microcode. - * -- * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a -- * simple model number comparison, so use STIBP as a heuristic to separate the -- * two uarches in Fam17h(AMD)/18h(Hygon). -+ * This chickenbit is something unrelated on Zen1. - */ - void amd_init_spectral_chicken(void) - { - uint64_t val, chickenbit = 1 << 1; - -- if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (cpu_has_hypervisor || !is_zen2_uarch()) - return; - - if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) -@@ -912,11 +910,8 @@ void amd_check_zenbleed(void) - * With the Fam17h check above, most parts getting here are - * Zen1. They're not affected. Assume Zen2 ones making it - * here are affected regardless of microcode version. -- * -- * Zen1 vs Zen2 isn't a simple model number comparison, so use -- * STIBP as a heuristic to distinguish. - */ -- if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (is_zen1_uarch()) - return; - good_rev = ~0U; - break; -@@ -1277,12 +1272,7 @@ static int __init cf_check zen2_c6_errata_check(void) - */ - s_time_t delta; - -- /* -- * Zen1 vs Zen2 isn't a simple model number comparison, so use STIBP as -- * a heuristic to separate the two uarches in Fam17h. -- */ -- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || -- !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) - return 0; - - /* -diff --git a/xen/arch/x86/include/asm/amd.h b/xen/arch/x86/include/asm/amd.h -index a975d3de26..82324110ab 100644 ---- a/xen/arch/x86/include/asm/amd.h -+++ b/xen/arch/x86/include/asm/amd.h -@@ -140,6 +140,17 @@ - AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \ - AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf)) - -+/* -+ * The Zen1 and Zen2 microarchitectures are implemented by AMD (Fam17h) and -+ * Hygon (Fam18h) but without simple model number rules. Instead, use STIBP -+ * as a heuristic that distinguishes the two. -+ * -+ * The caller is required to perform the appropriate vendor/family checks -+ * first. -+ */ -+#define is_zen1_uarch() (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+#define is_zen2_uarch() boot_cpu_has(X86_FEATURE_AMD_STIBP) -+ - struct cpuinfo_x86; - int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...); - --- -2.41.0 - diff --git a/xsa439-0010-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch b/xsa439-0010-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch deleted file mode 100644 index 0dc6780..0000000 --- a/xsa439-0010-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch +++ /dev/null @@ -1,231 +0,0 @@ -From 9ac2f49f5fa3a5159409241d4f74fb0d721dd4c5 Mon Sep 17 00:00:00 2001 -Message-ID: <9ac2f49f5fa3a5159409241d4f74fb0d721dd4c5.1695733540.git.m.a.young@durham.ac.uk> -In-Reply-To: -References: -From: Andrew Cooper -Date: Wed, 30 Aug 2023 20:24:25 +0100 -Subject: [XEN PATCH 10/10] x86/spec-ctrl: Mitigate the Zen1 DIV leakage - -In the Zen1 microarchitecure, there is one divider in the pipeline which -services uops from both threads. In the case of #DE, the latched result from -the previous DIV to execute will be forwarded speculatively. - -This is an interesting covert channel that allows two threads to communicate -without any system calls. In also allows userspace to obtain the result of -the most recent DIV instruction executed (even speculatively) in the core, -which can be from a higher privilege context. - -Scrub the result from the divider by executing a non-faulting divide. 
This -needs performing on the exit-to-guest paths, and ist_exit-to-Xen. - -Alternatives in IST context is believed safe now that it's done in NMI -context. - -This is XSA-439 / CVE-2023-20588. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit b5926c6ecf05c28ee99c6248c42d691ccbf0c315) ---- - docs/misc/xen-command-line.pandoc | 6 ++- - xen/arch/x86/hvm/svm/entry.S | 1 + - xen/arch/x86/include/asm/cpufeatures.h | 2 +- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 17 +++++++++ - xen/arch/x86/spec_ctrl.c | 48 +++++++++++++++++++++++- - 5 files changed, 71 insertions(+), 3 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index d9dae740cc..b92c8f969c 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2315,7 +2315,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - > {msr-sc,rsb,md-clear,ibpb-entry}=|{pv,hvm}=, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, --> unpriv-mmio,gds-mit}= ]` -+> unpriv-mmio,gds-mit,div-scrub}= ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2437,6 +2437,10 @@ has elected not to lock the configuration, Xen will use GDS_CTRL to mitigate - GDS with. Otherwise, Xen will mitigate by disabling AVX, which blocks the use - of the AVX2 Gather instructions. - -+On all hardware, the `div-scrub=` option can be used to force or prevent Xen -+from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate -+DIV-leakage on hardware believed to be vulnerable. -+ - ### sync_console - > `= ` - -diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S -index 981cd82e7c..934f12cf5c 100644 ---- a/xen/arch/x86/hvm/svm/entry.S -+++ b/xen/arch/x86/hvm/svm/entry.S -@@ -74,6 +74,7 @@ __UNLIKELY_END(nsvm_hap) - 1: /* No Spectre v1 concerns. Execution will hit VMRUN imminently. */ - .endm - ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - - pop %r15 - pop %r14 -diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h -index da0593de85..c3aad21c3b 100644 ---- a/xen/arch/x86/include/asm/cpufeatures.h -+++ b/xen/arch/x86/include/asm/cpufeatures.h -@@ -35,7 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM - XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ - XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ - XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ --/* Bits 23 unused. */ -+XEN_CPUFEATURE(SC_DIV, X86_SYNTH(23)) /* DIV scrub needed */ - XEN_CPUFEATURE(SC_RSB_IDLE, X86_SYNTH(24)) /* RSB overwrite needed for idle. */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ - XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 28a75796e6..f4b8b9d956 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -177,6 +177,19 @@ - .L\@_verw_skip: - .endm - -+.macro DO_SPEC_CTRL_DIV -+/* -+ * Requires nothing -+ * Clobbers %rax -+ * -+ * Issue a DIV for its flushing side effect (Zen1 uarch specific). 
Any -+ * non-faulting DIV will do; a byte DIV has least latency, and doesn't clobber -+ * %rdx. -+ */ -+ mov $1, %eax -+ div %al -+.endm -+ - .macro DO_SPEC_CTRL_ENTRY maybexen:req - /* - * Requires %rsp=regs (also cpuinfo if !maybexen) -@@ -279,6 +292,8 @@ - ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV - - DO_SPEC_CTRL_COND_VERW -+ -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - .endm - - /* -@@ -391,6 +406,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - verw STACK_CPUINFO_FIELD(verw_sel)(%r14) - .L\@_skip_verw: - -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV -+ - .L\@_skip_ist_exit: - .endm - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 79b98f0fe7..0ff3c895ac 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -79,6 +79,7 @@ static int8_t __initdata opt_srb_lock = -1; - static bool __initdata opt_unpriv_mmio; - static bool __ro_after_init opt_fb_clear_mmio; - static int8_t __initdata opt_gds_mit = -1; -+static int8_t __initdata opt_div_scrub = -1; - - static int __init cf_check parse_spec_ctrl(const char *s) - { -@@ -133,6 +134,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_srb_lock = 0; - opt_unpriv_mmio = false; - opt_gds_mit = 0; -+ opt_div_scrub = 0; - } - else if ( val > 0 ) - rc = -EINVAL; -@@ -285,6 +287,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_unpriv_mmio = val; - else if ( (val = parse_boolean("gds-mit", s, ss)) >= 0 ) - opt_gds_mit = val; -+ else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 ) -+ opt_div_scrub = val; - else - rc = -EINVAL; - -@@ -485,7 +489,7 @@ static void __init print_details(enum ind_thunk thunk) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : -@@ -510,6 +514,7 @@ static void __init print_details(enum ind_thunk thunk) - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm || - opt_fb_clear_mmio ? " VERW" : "", -+ opt_div_scrub ? " DIV" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); - - /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ -@@ -967,6 +972,45 @@ static void __init srso_calculations(bool hw_smt_enabled) - setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - } - -+/* -+ * The Div leakage issue is specific to the AMD Zen1 microarchitecure. -+ * -+ * However, there's no $FOO_NO bit defined, so if we're virtualised we have no -+ * hope of spotting the case where we might move to vulnerable hardware. We -+ * also can't make any useful conclusion about SMT-ness. -+ * -+ * Don't check the hypervisor bit, so at least we do the safe thing when -+ * booting on something that looks like a Zen1 CPU. 
-+ */ -+static bool __init has_div_vuln(void) -+{ -+ if ( !(boot_cpu_data.x86_vendor & -+ (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ) -+ return false; -+ -+ if ( boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18 ) -+ return false; -+ -+ return is_zen1_uarch(); -+} -+ -+static void __init div_calculations(bool hw_smt_enabled) -+{ -+ bool cpu_bug_div = has_div_vuln(); -+ -+ if ( opt_div_scrub == -1 ) -+ opt_div_scrub = cpu_bug_div; -+ -+ if ( opt_div_scrub ) -+ setup_force_cpu_cap(X86_FEATURE_SC_DIV); -+ -+ if ( opt_smt == -1 && !cpu_has_hypervisor && cpu_bug_div && hw_smt_enabled ) -+ warning_add( -+ "Booted on leaky-DIV hardware with SMT/Hyperthreading\n" -+ "enabled. Please assess your configuration and choose an\n" -+ "explicit 'smt=' setting. See XSA-439.\n"); -+} -+ - static void __init ibpb_calculations(void) - { - bool def_ibpb_entry = false; -@@ -1726,6 +1770,8 @@ void __init init_speculation_mitigations(void) - - ibpb_calculations(); - -+ div_calculations(hw_smt_enabled); -+ - /* Check whether Eager FPU should be enabled by default. */ - if ( opt_eager_fpu == -1 ) - opt_eager_fpu = should_use_eager_fpu(); --- -2.41.0 - diff --git a/xsa440-4.17.patch b/xsa440-4.17.patch deleted file mode 100644 index 4941afc..0000000 --- a/xsa440-4.17.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 5d8b3d1ec98e56155d9650d7f4a70cd8ba9dc27d Mon Sep 17 00:00:00 2001 -From: Julien Grall -Date: Fri, 22 Sep 2023 11:32:16 +0100 -Subject: tools/xenstored: domain_entry_fix(): Handle conflicting transaction - -The function domain_entry_fix() will be initially called to check if the -quota is correct before attempt to commit any nodes. So it would be -possible that accounting is temporarily negative. This is the case -in the following sequence: - - 1) Create 50 nodes - 2) Start two transactions - 3) Delete all the nodes in each transaction - 4) Commit the two transactions - -Because the first transaction will have succeed and updated the -accounting, there is no guarantee that 'd->nbentry + num' will still -be above 0. So the assert() would be triggered. -The assert() was introduced in dbef1f748289 ("tools/xenstore: simplify -and fix per domain node accounting") with the assumption that the -value can't be negative. As this is not true revert to the original -check but restricted to the path where we don't update. Take the -opportunity to explain the rationale behind the check. - -This CVE-2023-34323 / XSA-440. - -Reported-by: Stanislav Uschakow -Fixes: dbef1f748289 ("tools/xenstore: simplify and fix per domain node accounting") -Signed-off-by: Julien Grall -Reviewed-by: Juergen Gross - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index aa86892fed9e..6074df210c6e 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -1094,10 +1094,20 @@ int domain_entry_fix(unsigned int domid, int num, bool update) - } - - cnt = d->nbentry + num; -- assert(cnt >= 0); - -- if (update) -+ if (update) { -+ assert(cnt >= 0); - d->nbentry = cnt; -+ } else if (cnt < 0) { -+ /* -+ * In a transaction when a node is being added/removed AND -+ * the same node has been added/removed outside the -+ * transaction in parallel, the result value may be negative. -+ * This is no problem, as the transaction will fail due to -+ * the resulting conflict. So override 'cnt'. -+ */ -+ cnt = 0; -+ } - - return domid_is_unprivileged(domid) ? 
cnt : 0; - } diff --git a/xsa442-4.17.patch b/xsa442-4.17.patch deleted file mode 100644 index a78bfdd..0000000 --- a/xsa442-4.17.patch +++ /dev/null @@ -1,185 +0,0 @@ -From 5b2ccb60ff22fbff44dd66214c2956a434ee6271 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Tue, 13 Jun 2023 15:01:05 +0200 -Subject: [PATCH] iommu/amd-vi: flush IOMMU TLB when flushing the DTE -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The caching invalidation guidelines from the AMD-Vi specification (48882—Rev -3.07-PUB—Oct 2022) seem to be misleading on some hardware, as devices will -malfunction (see stale DMA mappings) if some fields of the DTE are updated but -the IOMMU TLB is not flushed. This has been observed in practice on AMD -systems. Due to the lack of guidance from the currently published -specification this patch aims to increase the flushing done in order to prevent -device malfunction. - -In order to fix, issue an INVALIDATE_IOMMU_PAGES command from -amd_iommu_flush_device(), flushing all the address space. Note this requires -callers to be adjusted in order to pass the DomID on the DTE previous to the -modification. - -Some call sites don't provide a valid DomID to amd_iommu_flush_device() in -order to avoid the flush. That's because the device had address translations -disabled and hence the previous DomID on the DTE is not valid. Note the -current logic relies on the entity disabling address translations to also flush -the TLB of the in use DomID. - -Device I/O TLB flushing when ATS are enabled is not covered by the current -change, as ATS usage is not security supported. - -This is XSA-442 / CVE-2023-34326 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich ---- - xen/drivers/passthrough/amd/iommu.h | 3 ++- - xen/drivers/passthrough/amd/iommu_cmd.c | 10 +++++++++- - xen/drivers/passthrough/amd/iommu_guest.c | 5 +++-- - xen/drivers/passthrough/amd/iommu_init.c | 6 +++++- - xen/drivers/passthrough/amd/pci_amd_iommu.c | 14 ++++++++++---- - 5 files changed, 29 insertions(+), 9 deletions(-) - -diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h -index 5429ada58ef5..a58be28bf96d 100644 ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -283,7 +283,8 @@ void amd_iommu_flush_pages(struct domain *d, unsigned long dfn, - unsigned int order); - void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev, - uint64_t gaddr, unsigned int order); --void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf); -+void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, -+ domid_t domid); - void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf); - void amd_iommu_flush_all_caches(struct amd_iommu *iommu); - -diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c -index 40ddf366bb4d..cb28b36abc38 100644 ---- a/xen/drivers/passthrough/amd/iommu_cmd.c -+++ b/xen/drivers/passthrough/amd/iommu_cmd.c -@@ -363,10 +363,18 @@ void amd_iommu_flush_pages(struct domain *d, - _amd_iommu_flush_pages(d, __dfn_to_daddr(dfn), order); - } - --void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf) -+void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, -+ domid_t domid) - { - invalidate_dev_table_entry(iommu, bdf); - flush_command_buffer(iommu, 0); -+ -+ /* Also invalidate IOMMU TLB entries when flushing the DTE. 
*/ -+ if ( domid != DOMID_INVALID ) -+ { -+ invalidate_iommu_pages(iommu, INV_IOMMU_ALL_PAGES_ADDRESS, domid, 0); -+ flush_command_buffer(iommu, 0); -+ } - } - - void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf) -diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c -index 80a331f546ed..be86bce6fb03 100644 ---- a/xen/drivers/passthrough/amd/iommu_guest.c -+++ b/xen/drivers/passthrough/amd/iommu_guest.c -@@ -385,7 +385,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd) - - static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) - { -- uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id; -+ uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id, prev_domid; - struct amd_iommu_dte *gdte, *mdte, *dte_base; - struct amd_iommu *iommu = NULL; - struct guest_iommu *g_iommu; -@@ -445,13 +445,14 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) - req_id = get_dma_requestor_id(iommu->seg, mbdf); - dte_base = iommu->dev_table.buffer; - mdte = &dte_base[req_id]; -+ prev_domid = mdte->domain_id; - - spin_lock_irqsave(&iommu->lock, flags); - dte_set_gcr3_table(mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx); - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - - return 0; - } -diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c -index 166570648d26..101a60ce1794 100644 ---- a/xen/drivers/passthrough/amd/iommu_init.c -+++ b/xen/drivers/passthrough/amd/iommu_init.c -@@ -1547,7 +1547,11 @@ static int cf_check _invalidate_all_devices( - req_id = ivrs_mappings[bdf].dte_requestor_id; - if ( iommu ) - { -- amd_iommu_flush_device(iommu, req_id); -+ /* -+ * IOMMU TLB flush performed separately (see -+ * invalidate_all_domain_pages()). -+ */ -+ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); - amd_iommu_flush_intremap(iommu, req_id); - } - } -diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c -index 94e37755064b..8641b84712a0 100644 ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -192,10 +192,13 @@ static int __must_check amd_iommu_setup_domain_device( - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ -+ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); - } - else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) ) - { -+ domid_t prev_domid = dte->domain_id; -+ - /* - * Strictly speaking if the device is the only one with this requestor - * ID, it could be allowed to be re-assigned regardless of unity map -@@ -252,7 +255,7 @@ static int __must_check amd_iommu_setup_domain_device( - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - } - else - spin_unlock_irqrestore(&iommu->lock, flags); -@@ -421,6 +424,8 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, - spin_lock_irqsave(&iommu->lock, flags); - if ( dte->tv || dte->v ) - { -+ domid_t prev_domid = dte->domain_id; -+ - /* See the comment in amd_iommu_setup_device_table(). 
*/ - dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED; - smp_wmb(); -@@ -439,7 +444,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - - AMD_IOMMU_DEBUG("Disable: device id = %#x, " - "domain = %d, paging mode = %d\n", -@@ -610,7 +615,8 @@ static int cf_check amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, bdf); -+ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ -+ amd_iommu_flush_device(iommu, bdf, DOMID_INVALID); - } - - if ( amd_iommu_reserve_domain_unity_map( --- -2.42.0 - diff --git a/xsa443-4.17-01.patch b/xsa443-4.17-01.patch deleted file mode 100644 index d9ca3f8..0000000 --- a/xsa443-4.17-01.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 7e48562bf34e90f907491a0595782d2daa1ff3ad Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Thu, 14 Sep 2023 13:22:50 +0100 -Subject: [PATCH 01/11] libfsimage/xfs: Remove dead code - -xfs_info.agnolog (and related code) and XFS_INO_AGBNO_BITS are dead code -that serve no purpose. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Reviewed-by: Jan Beulich ---- - tools/libfsimage/xfs/fsys_xfs.c | 18 ------------------ - 1 file changed, 18 deletions(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index d735a88e55f3..2800699f5985 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -37,7 +37,6 @@ struct xfs_info { - int blklog; - int inopblog; - int agblklog; -- int agnolog; - unsigned int nextents; - xfs_daddr_t next; - xfs_daddr_t daddr; -@@ -65,9 +64,7 @@ static struct xfs_info xfs; - - #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) - #define XFS_INO_OFFSET_BITS xfs.inopblog --#define XFS_INO_AGBNO_BITS xfs.agblklog - #define XFS_INO_AGINO_BITS (xfs.agblklog + xfs.inopblog) --#define XFS_INO_AGNO_BITS xfs.agnolog - - static inline xfs_agblock_t - agino2agbno (xfs_agino_t agino) -@@ -149,20 +146,6 @@ xt_len (xfs_bmbt_rec_32_t *r) - return le32(r->l3) & mask32lo(21); - } - --static inline int --xfs_highbit32(xfs_uint32_t v) --{ -- int i; -- -- if (--v) { -- for (i = 0; i < 31; i++, v >>= 1) { -- if (v == 0) -- return i; -- } -- } -- return 0; --} -- - static int - isinxt (xfs_fileoff_t key, xfs_fileoff_t offset, xfs_filblks_t len) - { -@@ -472,7 +455,6 @@ xfs_mount (fsi_file_t *ffi, const char *options) - - xfs.inopblog = super.sb_inopblog; - xfs.agblklog = super.sb_agblklog; -- xfs.agnolog = xfs_highbit32 (le32(super.sb_agcount)); - - xfs.btnode_ptr0_off = - ((xfs.bsize - sizeof(xfs_btree_block_t)) / --- -2.42.0 - diff --git a/xsa443-4.17-02.patch b/xsa443-4.17-02.patch deleted file mode 100644 index 0f2edaf..0000000 --- a/xsa443-4.17-02.patch +++ /dev/null @@ -1,32 +0,0 @@ -From c26327795b78c93f6fa6d5d46e34f59dc4046601 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Thu, 14 Sep 2023 13:22:51 +0100 -Subject: [PATCH 02/11] libfsimage/xfs: Amend mask32lo() to allow the value 32 - -agblklog could plausibly be 32, but that would overflow this shift. -Perform the shift as ULL and cast to u32 at the end instead. 
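The hazard being fixed: shifting a 32-bit value by 32 is undefined
behaviour in C (the shift width equals the type width), whereas shifting in
64 bits and truncating is well defined across the whole 0-32 range. A
minimal illustration, with hypothetical macro names:

    #include <stdint.h>

    /* Undefined behaviour when n == 32. */
    #define MASK32LO_BAD(n)  ((uint32_t)(((uint32_t)1 << (n)) - 1))

    /* Well defined for 0 <= n <= 32: shift as ULL, truncate at the end. */
    #define MASK32LO_OK(n)   ((uint32_t)((1ull << (n)) - 1))

    /* MASK32LO_OK(32) == 0xffffffffu, MASK32LO_OK(5) == 0x1fu. */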
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Acked-by: Jan Beulich ---- - tools/libfsimage/xfs/fsys_xfs.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index 2800699f5985..4720bb4505c8 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -60,7 +60,7 @@ static struct xfs_info xfs; - #define inode ((xfs_dinode_t *)((char *)FSYS_BUF + 8192)) - #define icore (inode->di_core) - --#define mask32lo(n) (((xfs_uint32_t)1 << (n)) - 1) -+#define mask32lo(n) ((xfs_uint32_t)((1ull << (n)) - 1)) - - #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) - #define XFS_INO_OFFSET_BITS xfs.inopblog --- -2.42.0 - diff --git a/xsa443-4.17-03.patch b/xsa443-4.17-03.patch deleted file mode 100644 index b89721a..0000000 --- a/xsa443-4.17-03.patch +++ /dev/null @@ -1,137 +0,0 @@ -From 199f0538bbec052028679a55ea512437170854c9 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Thu, 14 Sep 2023 13:22:52 +0100 -Subject: [PATCH 03/11] libfsimage/xfs: Sanity-check the superblock during - mounts - -Sanity-check the XFS superblock for wellformedness at the mount handler. -This forces pygrub to abort parsing a potentially malformed filesystem and -ensures the invariants assumed throughout the rest of the code hold. - -Also, derive parameters from previously sanitized parameters where possible -(rather than reading them off the superblock) - -The code doesn't try to avoid overflowing the end of the disk, because -that's an unlikely and benign error. Parameters used in calculations of -xfs_daddr_t (like the root inode index) aren't in critical need of being -sanitized. - -The sanitization of agblklog is basically checking that no obvious -overflows happen on agblklog, and then ensuring agblocks is contained in -the range (2^(sb_agblklog-1), 2^sb_agblklog]. - -This is part of XSA-443 / CVE-2023-34325 - -Reported-by: Ferdinand Nölscher -Signed-off-by: Alejandro Vallejo -Reviewed-by: Jan Beulich ---- - tools/libfsimage/xfs/fsys_xfs.c | 48 ++++++++++++++++++++++++++------- - tools/libfsimage/xfs/xfs.h | 12 +++++++++ - 2 files changed, 50 insertions(+), 10 deletions(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index 4720bb4505c8..e4eb7e1ee26f 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -17,6 +17,7 @@ - * along with this program; If not, see . 
- */ - -+#include - #include - #include "xfs.h" - -@@ -433,29 +434,56 @@ first_dentry (fsi_file_t *ffi, xfs_ino_t *ino) - return next_dentry (ffi, ino); - } - -+static bool -+xfs_sb_is_invalid (const xfs_sb_t *super) -+{ -+ return (le32(super->sb_magicnum) != XFS_SB_MAGIC) -+ || ((le16(super->sb_versionnum) & XFS_SB_VERSION_NUMBITS) != -+ XFS_SB_VERSION_4) -+ || (super->sb_inodelog < XFS_SB_INODELOG_MIN) -+ || (super->sb_inodelog > XFS_SB_INODELOG_MAX) -+ || (super->sb_blocklog < XFS_SB_BLOCKLOG_MIN) -+ || (super->sb_blocklog > XFS_SB_BLOCKLOG_MAX) -+ || (super->sb_blocklog < super->sb_inodelog) -+ || (super->sb_agblklog > XFS_SB_AGBLKLOG_MAX) -+ || ((1ull << super->sb_agblklog) < le32(super->sb_agblocks)) -+ || (((1ull << super->sb_agblklog) >> 1) >= -+ le32(super->sb_agblocks)) -+ || ((super->sb_blocklog + super->sb_dirblklog) >= -+ XFS_SB_DIRBLK_NUMBITS); -+} -+ - static int - xfs_mount (fsi_file_t *ffi, const char *options) - { - xfs_sb_t super; - - if (!devread (ffi, 0, 0, sizeof(super), (char *)&super) -- || (le32(super.sb_magicnum) != XFS_SB_MAGIC) -- || ((le16(super.sb_versionnum) -- & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4) ) { -+ || xfs_sb_is_invalid(&super)) { - return 0; - } - -- xfs.bsize = le32 (super.sb_blocksize); -- xfs.blklog = super.sb_blocklog; -- xfs.bdlog = xfs.blklog - SECTOR_BITS; -+ /* -+ * Not sanitized. It's exclusively used to generate disk addresses, -+ * so it's not important from a security standpoint. -+ */ - xfs.rootino = le64 (super.sb_rootino); -- xfs.isize = le16 (super.sb_inodesize); -- xfs.agblocks = le32 (super.sb_agblocks); -- xfs.dirbsize = xfs.bsize << super.sb_dirblklog; - -- xfs.inopblog = super.sb_inopblog; -+ /* -+ * Sanitized to be consistent with each other, only used to -+ * generate disk addresses, so it's safe -+ */ -+ xfs.agblocks = le32 (super.sb_agblocks); - xfs.agblklog = super.sb_agblklog; - -+ /* Derived from sanitized parameters */ -+ xfs.bsize = 1 << super.sb_blocklog; -+ xfs.blklog = super.sb_blocklog; -+ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; -+ xfs.isize = 1 << super.sb_inodelog; -+ xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); -+ xfs.inopblog = super.sb_blocklog - super.sb_inodelog; -+ - xfs.btnode_ptr0_off = - ((xfs.bsize - sizeof(xfs_btree_block_t)) / - (sizeof (xfs_bmbt_key_t) + sizeof (xfs_bmbt_ptr_t))) -diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h -index 40699281e44d..b87e37d3d7e9 100644 ---- a/tools/libfsimage/xfs/xfs.h -+++ b/tools/libfsimage/xfs/xfs.h -@@ -134,6 +134,18 @@ typedef struct xfs_sb - xfs_uint8_t sb_dummy[7]; /* padding */ - } xfs_sb_t; - -+/* Bound taken from xfs.c in GRUB2. It doesn't exist in the spec */ -+#define XFS_SB_DIRBLK_NUMBITS 27 -+/* Implied by the XFS specification. The minimum block size is 512 octets */ -+#define XFS_SB_BLOCKLOG_MIN 9 -+/* Implied by the XFS specification. The maximum block size is 65536 octets */ -+#define XFS_SB_BLOCKLOG_MAX 16 -+/* Implied by the XFS specification. The minimum inode size is 256 octets */ -+#define XFS_SB_INODELOG_MIN 8 -+/* Implied by the XFS specification. 
The maximum inode size is 2048 octets */ -+#define XFS_SB_INODELOG_MAX 11 -+/* High bound for sb_agblklog */ -+#define XFS_SB_AGBLKLOG_MAX 32 - - /* those are from xfs_btree.h */ - --- -2.42.0 - diff --git a/xsa443-4.17-04.patch b/xsa443-4.17-04.patch deleted file mode 100644 index dde095e..0000000 --- a/xsa443-4.17-04.patch +++ /dev/null @@ -1,61 +0,0 @@ -From c66fd01277939634c624c8340838682d9d4fd839 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Thu, 14 Sep 2023 13:22:53 +0100 -Subject: [PATCH 04/11] libfsimage/xfs: Add compile-time check to libfsimage - -Adds the common tools include folder to the -I compile flags -of libfsimage. This allows us to use: - xen-tools/common-macros.h:BUILD_BUG_ON() - -With it, statically assert a sanitized "blocklog - SECTOR_BITS" cannot -underflow. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Reviewed-by: Jan Beulich ---- - tools/libfsimage/common.mk | 2 +- - tools/libfsimage/xfs/fsys_xfs.c | 4 +++- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/tools/libfsimage/common.mk b/tools/libfsimage/common.mk -index 4fc8c6679599..e4336837d045 100644 ---- a/tools/libfsimage/common.mk -+++ b/tools/libfsimage/common.mk -@@ -1,7 +1,7 @@ - include $(XEN_ROOT)/tools/Rules.mk - - FSDIR := $(libdir)/xenfsimage --CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ -DFSIMAGE_FSDIR=\"$(FSDIR)\" -+CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ $(CFLAGS_xeninclude) -DFSIMAGE_FSDIR=\"$(FSDIR)\" - CFLAGS += -D_GNU_SOURCE - LDFLAGS += -L../common/ - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index e4eb7e1ee26f..4a8dd6f2397b 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -19,6 +19,7 @@ - - #include - #include -+#include - #include "xfs.h" - - #define MAX_LINK_COUNT 8 -@@ -477,9 +478,10 @@ xfs_mount (fsi_file_t *ffi, const char *options) - xfs.agblklog = super.sb_agblklog; - - /* Derived from sanitized parameters */ -+ BUILD_BUG_ON(XFS_SB_BLOCKLOG_MIN < SECTOR_BITS); -+ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; - xfs.bsize = 1 << super.sb_blocklog; - xfs.blklog = super.sb_blocklog; -- xfs.bdlog = super.sb_blocklog - SECTOR_BITS; - xfs.isize = 1 << super.sb_inodelog; - xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); - xfs.inopblog = super.sb_blocklog - super.sb_inodelog; --- -2.42.0 - diff --git a/xsa443-4.17-05.patch b/xsa443-4.17-05.patch deleted file mode 100644 index b2f5daa..0000000 --- a/xsa443-4.17-05.patch +++ /dev/null @@ -1,59 +0,0 @@ -From ad5d0db5e68e5d4e79255fa85d9cb0069bb1c5d5 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Mon, 25 Sep 2023 18:32:21 +0100 -Subject: [PATCH 05/11] tools/pygrub: Remove unnecessary hypercall - -There's a hypercall being issued in order to determine whether PV64 is -supported, but since Xen 4.3 that's strictly true so it's not required. - -Plus, this way we can avoid mapping the privcmd interface altogether in the -depriv pygrub. 
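For reference, the check being removed amounted to asking the hypervisor
for its capability string and scanning it for the 64-bit PV ABI token. A
hedged C equivalent of that probe via libxc (a sketch only; the real code
used the Python xc bindings, and a robust version would tokenise the string
rather than substring-match):

    #include <stdbool.h>
    #include <string.h>
    #include <xenctrl.h>

    static bool supports_64bit_pv(xc_interface *xch)
    {
        xen_capabilities_info_t caps;

        if ( xc_version(xch, XENVER_capabilities, &caps) < 0 )
            return false;

        /* Unconditionally true since Xen 4.3, hence the removal. */
        return strstr(caps, "xen-3.0-x86_64") != NULL;
    }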
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Reviewed-by: Andrew Cooper ---- - tools/pygrub/src/pygrub | 12 +----------- - 1 file changed, 1 insertion(+), 11 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index ce7ab0eb8cf3..ce4e07d3e823 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -18,7 +18,6 @@ import os, sys, string, struct, tempfile, re, traceback, stat, errno - import copy - import logging - import platform --import xen.lowlevel.xc - - import curses, _curses, curses.textpad, curses.ascii - import getopt -@@ -668,14 +667,6 @@ def run_grub(file, entry, fs, cfg_args): - - return grubcfg - --def supports64bitPVguest(): -- xc = xen.lowlevel.xc.xc() -- caps = xc.xeninfo()['xen_caps'].split(" ") -- for cap in caps: -- if cap == "xen-3.0-x86_64": -- return True -- return False -- - # If nothing has been specified, look for a Solaris domU. If found, perform the - # necessary tweaks. - def sniff_solaris(fs, cfg): -@@ -684,8 +675,7 @@ def sniff_solaris(fs, cfg): - return cfg - - if not cfg["kernel"]: -- if supports64bitPVguest() and \ -- fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): -+ if fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): - cfg["kernel"] = "/platform/i86xpv/kernel/amd64/unix" - cfg["ramdisk"] = "/platform/i86pc/amd64/boot_archive" - elif fs.file_exists("/platform/i86xpv/kernel/unix"): --- -2.42.0 - diff --git a/xsa443-4.17-06.patch b/xsa443-4.17-06.patch deleted file mode 100644 index 22af109..0000000 --- a/xsa443-4.17-06.patch +++ /dev/null @@ -1,65 +0,0 @@ -From d3ceb0b314005a656dd2ca4b2821575a36f8426d Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Mon, 25 Sep 2023 18:32:22 +0100 -Subject: [PATCH 06/11] tools/pygrub: Small refactors - -Small tidy up to ensure output_directory always has a trailing '/' to ease -concatenating paths and that `output` can only be a filename or None. 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Acked-by: Andrew Cooper ---- - tools/pygrub/src/pygrub | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index ce4e07d3e823..1042c05b8676 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -793,7 +793,7 @@ if __name__ == "__main__": - debug = False - not_really = False - output_format = "sxp" -- output_directory = "/var/run/xen/pygrub" -+ output_directory = "/var/run/xen/pygrub/" - - # what was passed in - incfg = { "kernel": None, "ramdisk": None, "args": "" } -@@ -815,7 +815,8 @@ if __name__ == "__main__": - usage() - sys.exit() - elif o in ("--output",): -- output = a -+ if a != "-": -+ output = a - elif o in ("--kernel",): - incfg["kernel"] = a - elif o in ("--ramdisk",): -@@ -847,12 +848,11 @@ if __name__ == "__main__": - if not os.path.isdir(a): - print("%s is not an existing directory" % a) - sys.exit(1) -- output_directory = a -+ output_directory = a + '/' - - if debug: - logging.basicConfig(level=logging.DEBUG) - -- - try: - os.makedirs(output_directory, 0o700) - except OSError as e: -@@ -861,7 +861,7 @@ if __name__ == "__main__": - else: - raise - -- if output is None or output == "-": -+ if output is None: - fd = sys.stdout.fileno() - else: - fd = os.open(output, os.O_WRONLY) --- -2.42.0 - diff --git a/xsa443-4.17-07.patch b/xsa443-4.17-07.patch deleted file mode 100644 index 94da883..0000000 --- a/xsa443-4.17-07.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 9e80cfecde338cea0db136c2fb5ed78d6081e05f Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Mon, 25 Sep 2023 18:32:23 +0100 -Subject: [PATCH 07/11] tools/pygrub: Open the output files earlier - -This patch allows pygrub to get ahold of every RW file descriptor it needs -early on. A later patch will clamp the filesystem it can access so it can't -obtain any others. 
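The pattern here is the usual privilege-separation one: acquire every
writable file descriptor while still privileged, so the later deprivileged
phase needs no write access to the host filesystem at all. A minimal C
sketch of the same idea (function name and output paths are illustrative):

    #include <stdlib.h>

    /* Open all outputs up front; chroot()/setuid() can follow safely. */
    static int open_outputs(int *fd_kernel, int *fd_ramdisk)
    {
        char k_tmpl[] = "/var/run/xen/pygrub/boot_kernel.XXXXXX";
        char r_tmpl[] = "/var/run/xen/pygrub/boot_ramdisk.XXXXXX";

        if ( (*fd_kernel = mkstemp(k_tmpl)) < 0 ||
             (*fd_ramdisk = mkstemp(r_tmpl)) < 0 )
            return -1;

        return 0;
    }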
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Acked-by: Andrew Cooper ---- - tools/pygrub/src/pygrub | 37 ++++++++++++++++++++++--------------- - 1 file changed, 22 insertions(+), 15 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index 1042c05b8676..91e2ec2ab105 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -738,8 +738,7 @@ if __name__ == "__main__": - def usage(): - print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] " %(sys.argv[0],), file=sys.stderr) - -- def copy_from_image(fs, file_to_read, file_type, output_directory, -- not_really): -+ def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): - if not_really: - if fs.file_exists(file_to_read): - return "<%s:%s>" % (file_type, file_to_read) -@@ -750,21 +749,18 @@ if __name__ == "__main__": - except Exception as e: - print(e, file=sys.stderr) - sys.exit("Error opening %s in guest" % file_to_read) -- (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".", -- dir=output_directory) - dataoff = 0 - while True: - data = datafile.read(FS_READ_MAX, dataoff) - if len(data) == 0: -- os.close(tfd) -+ os.close(fd_dst) - del datafile -- return ret -+ return - try: -- os.write(tfd, data) -+ os.write(fd_dst, data) - except Exception as e: - print(e, file=sys.stderr) -- os.close(tfd) -- os.unlink(ret) -+ os.unlink(path_dst) - del datafile - sys.exit("Error writing temporary copy of "+file_type) - dataoff += len(data) -@@ -861,6 +857,14 @@ if __name__ == "__main__": - else: - raise - -+ if not_really: -+ fd_kernel = path_kernel = fd_ramdisk = path_ramdisk = None -+ else: -+ (fd_kernel, path_kernel) = tempfile.mkstemp(prefix="boot_kernel.", -+ dir=output_directory) -+ (fd_ramdisk, path_ramdisk) = tempfile.mkstemp(prefix="boot_ramdisk.", -+ dir=output_directory) -+ - if output is None: - fd = sys.stdout.fileno() - else: -@@ -920,20 +924,23 @@ if __name__ == "__main__": - if fs is None: - raise RuntimeError("Unable to find partition containing kernel") - -- bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel", -- output_directory, not_really) -+ copy_from_image(fs, chosencfg["kernel"], "kernel", -+ fd_kernel, path_kernel, not_really) -+ bootcfg["kernel"] = path_kernel - - if chosencfg["ramdisk"]: - try: -- bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"], -- "ramdisk", output_directory, -- not_really) -+ copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", -+ fd_ramdisk, path_ramdisk, not_really) - except: - if not not_really: -- os.unlink(bootcfg["kernel"]) -+ os.unlink(path_kernel) - raise -+ bootcfg["ramdisk"] = path_ramdisk - else: - initrd = None -+ if not not_really: -+ os.unlink(path_ramdisk) - - args = None - if chosencfg["args"]: --- -2.42.0 - diff --git a/xsa443-4.17-08.patch b/xsa443-4.17-08.patch deleted file mode 100644 index bd7de1d..0000000 --- a/xsa443-4.17-08.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 2fb4cdcedd8720f78c4bd44739a5d30dd1a7d9a5 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Mon, 25 Sep 2023 18:32:24 +0100 -Subject: [PATCH 08/11] tools/libfsimage: Export a new function to preload all - plugins - -This is work required in order to let pygrub operate in highly deprivileged -chroot mode. 
This patch adds a function that preloads every plugin, hence -ensuring that a on function exit, every shared library is loaded in memory. - -The new "init" function is supposed to be used before depriv, but that's -fine because it's not acting on untrusted data. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Acked-by: Andrew Cooper ---- - tools/libfsimage/common/fsimage_plugin.c | 4 ++-- - tools/libfsimage/common/mapfile-GNU | 1 + - tools/libfsimage/common/mapfile-SunOS | 1 + - tools/libfsimage/common/xenfsimage.h | 8 ++++++++ - tools/pygrub/src/fsimage/fsimage.c | 15 +++++++++++++++ - 5 files changed, 27 insertions(+), 2 deletions(-) - -diff --git a/tools/libfsimage/common/fsimage_plugin.c b/tools/libfsimage/common/fsimage_plugin.c -index de1412b4233a..d0cb9e96a654 100644 ---- a/tools/libfsimage/common/fsimage_plugin.c -+++ b/tools/libfsimage/common/fsimage_plugin.c -@@ -119,7 +119,7 @@ fail: - return (-1); - } - --static int load_plugins(void) -+int fsi_init(void) - { - const char *fsdir = getenv("XEN_FSIMAGE_FSDIR"); - struct dirent *dp = NULL; -@@ -180,7 +180,7 @@ int find_plugin(fsi_t *fsi, const char *path, const char *options) - fsi_plugin_t *fp; - int ret = 0; - -- if (plugins == NULL && (ret = load_plugins()) != 0) -+ if (plugins == NULL && (ret = fsi_init()) != 0) - goto out; - - for (fp = plugins; fp != NULL; fp = fp->fp_next) { -diff --git a/tools/libfsimage/common/mapfile-GNU b/tools/libfsimage/common/mapfile-GNU -index 26d4d7a69ec7..2d54d527d7f5 100644 ---- a/tools/libfsimage/common/mapfile-GNU -+++ b/tools/libfsimage/common/mapfile-GNU -@@ -1,6 +1,7 @@ - VERSION { - libfsimage.so.1.0 { - global: -+ fsi_init; - fsi_open_fsimage; - fsi_close_fsimage; - fsi_file_exists; -diff --git a/tools/libfsimage/common/mapfile-SunOS b/tools/libfsimage/common/mapfile-SunOS -index e99b90b65077..48deedb4252f 100644 ---- a/tools/libfsimage/common/mapfile-SunOS -+++ b/tools/libfsimage/common/mapfile-SunOS -@@ -1,5 +1,6 @@ - libfsimage.so.1.0 { - global: -+ fsi_init; - fsi_open_fsimage; - fsi_close_fsimage; - fsi_file_exists; -diff --git a/tools/libfsimage/common/xenfsimage.h b/tools/libfsimage/common/xenfsimage.h -index 201abd54f23a..341883b2d71a 100644 ---- a/tools/libfsimage/common/xenfsimage.h -+++ b/tools/libfsimage/common/xenfsimage.h -@@ -35,6 +35,14 @@ extern C { - typedef struct fsi fsi_t; - typedef struct fsi_file fsi_file_t; - -+/* -+ * Optional initialization function. If invoked it loads the associated -+ * dynamic libraries for the backends ahead of time. This is required if -+ * the library is to run as part of a highly deprivileged executable, as -+ * the libraries may not be reachable after depriv. 
-+ */ -+int fsi_init(void); -+ - fsi_t *fsi_open_fsimage(const char *, uint64_t, const char *); - void fsi_close_fsimage(fsi_t *); - -diff --git a/tools/pygrub/src/fsimage/fsimage.c b/tools/pygrub/src/fsimage/fsimage.c -index 2ebbbe35df92..92fbf2851f01 100644 ---- a/tools/pygrub/src/fsimage/fsimage.c -+++ b/tools/pygrub/src/fsimage/fsimage.c -@@ -286,6 +286,15 @@ fsimage_getbootstring(PyObject *o, PyObject *args) - return Py_BuildValue("s", bootstring); - } - -+static PyObject * -+fsimage_init(PyObject *o, PyObject *args) -+{ -+ if (!PyArg_ParseTuple(args, "")) -+ return (NULL); -+ -+ return Py_BuildValue("i", fsi_init()); -+} -+ - PyDoc_STRVAR(fsimage_open__doc__, - "open(name, [offset=off]) - Open the given file as a filesystem image.\n" - "\n" -@@ -297,7 +306,13 @@ PyDoc_STRVAR(fsimage_getbootstring__doc__, - "getbootstring(fs) - Return the boot string needed for this file system " - "or NULL if none is needed.\n"); - -+PyDoc_STRVAR(fsimage_init__doc__, -+ "init() - Loads every dynamic library contained in xenfsimage " -+ "into memory so that it can be used in chrooted environments.\n"); -+ - static struct PyMethodDef fsimage_module_methods[] = { -+ { "init", (PyCFunction)fsimage_init, -+ METH_VARARGS, fsimage_init__doc__ }, - { "open", (PyCFunction)fsimage_open, - METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ }, - { "getbootstring", (PyCFunction)fsimage_getbootstring, --- -2.42.0 - diff --git a/xsa443-4.17-09.patch b/xsa443-4.17-09.patch deleted file mode 100644 index 2e3ebd8..0000000 --- a/xsa443-4.17-09.patch +++ /dev/null @@ -1,307 +0,0 @@ -From 150771ce86a07e469e34941a63c56e2cf242223b Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo -Date: Mon, 25 Sep 2023 18:32:25 +0100 -Subject: [PATCH 09/11] tools/pygrub: Deprivilege pygrub - -Introduce a --runas= flag to deprivilege pygrub on Linux and *BSDs. It -also implicitly creates a chroot env where it drops a deprivileged forked -process. The chroot itself is cleaned up at the end. - -If the --runas arg is present, then pygrub forks, leaving the child to -deprivilege itself, and waiting for it to complete. When the child exists, -the parent performs cleanup and exits with the same error code. - -This is roughly what the child does: - 1. Initialize libfsimage (this loads every .so in memory so the chroot - can avoid bind-mounting /{,usr}/lib* - 2. Create a temporary empty chroot directory - 3. Mount tmpfs in it - 4. Bind mount the disk inside, because libfsimage expects a path, not a - file descriptor. - 5. Remount the root tmpfs to be stricter (ro,nosuid,nodev) - 6. Set RLIMIT_FSIZE to a sensibly high amount (128 MiB) - 7. Depriv gid, groups and uid - -With this scheme in place, the "output" files are writable (up to -RLIMIT_FSIZE octets) and the exposed filesystem is immutable and contains -the single only file we can't easily get rid of (the disk). - -If running on Linux, the child process also unshares mount, IPC, and -network namespaces before dropping its privileges. 
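
That sequence, condensed into a hedged, Linux-only C sketch (error handling
elided; root and uid are caller-supplied placeholders):

    #define _GNU_SOURCE
    #include <grp.h>
    #include <sched.h>
    #include <sys/mount.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void depriv(const char *root, uid_t uid)
    {
        /* Namespaces die with the process, so the kernel bulk-releases
         * the mounts; the parent only removes the temporary directory. */
        unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET);

        /* Bound what a compromised parser may write (128 MiB here). */
        struct rlimit fsize = { 128u << 20, 128u << 20 };
        setrlimit(RLIMIT_FSIZE, &fsize);

        /* One read-only, empty filesystem view. Simplified: the real
         * flow mounts read-write, bind-mounts the disk inside, then
         * remounts read-only. */
        mount("none", root, "tmpfs", MS_RDONLY | MS_NOSUID | MS_NODEV, NULL);
        chroot(root);
        chdir("/");

        /* Drop identity last: gid and groups before uid. */
        gid_t gid = uid;
        setgid(gid);
        setgroups(1, &gid);
        setuid(uid);
    }
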
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo -Acked-by: Andrew Cooper ---- - tools/pygrub/setup.py | 2 +- - tools/pygrub/src/pygrub | 162 +++++++++++++++++++++++++++++++++++++--- - 2 files changed, 154 insertions(+), 10 deletions(-) - -diff --git a/tools/pygrub/setup.py b/tools/pygrub/setup.py -index 0e4e3d02d372..06b96733d020 100644 ---- a/tools/pygrub/setup.py -+++ b/tools/pygrub/setup.py -@@ -17,7 +17,7 @@ xenfsimage = Extension("xenfsimage", - pkgs = [ 'grub' ] - - setup(name='pygrub', -- version='0.6', -+ version='0.7', - description='Boot loader that looks a lot like grub for Xen', - author='Jeremy Katz', - author_email='katzj@redhat.com', -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index 91e2ec2ab105..7cea496ade08 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -16,8 +16,11 @@ from __future__ import print_function - - import os, sys, string, struct, tempfile, re, traceback, stat, errno - import copy -+import ctypes, ctypes.util - import logging - import platform -+import resource -+import subprocess - - import curses, _curses, curses.textpad, curses.ascii - import getopt -@@ -27,10 +30,135 @@ import grub.GrubConf - import grub.LiloConf - import grub.ExtLinuxConf - --PYGRUB_VER = 0.6 -+PYGRUB_VER = 0.7 - FS_READ_MAX = 1024 * 1024 - SECTOR_SIZE = 512 - -+# Unless provided through the env variable PYGRUB_MAX_FILE_SIZE_MB, then -+# this is the maximum filesize allowed for files written by the depriv -+# pygrub -+LIMIT_FSIZE = 128 << 20 -+ -+CLONE_NEWNS = 0x00020000 # mount namespace -+CLONE_NEWNET = 0x40000000 # network namespace -+CLONE_NEWIPC = 0x08000000 # IPC namespace -+ -+def unshare(flags): -+ if not sys.platform.startswith("linux"): -+ print("skip_unshare reason=not_linux platform=%s", sys.platform, file=sys.stderr) -+ return -+ -+ libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True) -+ unshare_prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True) -+ unshare = unshare_prototype(('unshare', libc)) -+ -+ if unshare(flags) < 0: -+ raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno())) -+ -+def bind_mount(src, dst, options): -+ open(dst, "a").close() # touch -+ -+ rc = subprocess.call(["mount", "--bind", "-o", options, src, dst]) -+ if rc != 0: -+ raise RuntimeError("bad_mount: src=%s dst=%s opts=%s" % -+ (src, dst, options)) -+ -+def downgrade_rlimits(): -+ # Wipe the authority to use unrequired resources -+ resource.setrlimit(resource.RLIMIT_NPROC, (0, 0)) -+ resource.setrlimit(resource.RLIMIT_CORE, (0, 0)) -+ resource.setrlimit(resource.RLIMIT_MEMLOCK, (0, 0)) -+ -+ # py2's resource module doesn't know about resource.RLIMIT_MSGQUEUE -+ # -+ # TODO: Use resource.RLIMIT_MSGQUEUE after python2 is deprecated -+ if sys.platform.startswith('linux'): -+ RLIMIT_MSGQUEUE = 12 -+ resource.setrlimit(RLIMIT_MSGQUEUE, (0, 0)) -+ -+ # The final look of the filesystem for this process is fully RO, but -+ # note we have some file descriptor already open (notably, kernel and -+ # ramdisk). In order to avoid a compromised pygrub from filling up the -+ # filesystem we set RLIMIT_FSIZE to a high bound, so that the file -+ # write permissions are bound. 
-+ fsize = LIMIT_FSIZE -+ if "PYGRUB_MAX_FILE_SIZE_MB" in os.environ.keys(): -+ fsize = os.environ["PYGRUB_MAX_FILE_SIZE_MB"] << 20 -+ -+ resource.setrlimit(resource.RLIMIT_FSIZE, (fsize, fsize)) -+ -+def depriv(output_directory, output, device, uid, path_kernel, path_ramdisk): -+ # The only point of this call is to force the loading of libfsimage. -+ # That way, we don't need to bind-mount it into the chroot -+ rc = xenfsimage.init() -+ if rc != 0: -+ os.unlink(path_ramdisk) -+ os.unlink(path_kernel) -+ raise RuntimeError("bad_xenfsimage: rc=%d" % rc) -+ -+ # Create a temporary directory for the chroot -+ chroot = tempfile.mkdtemp(prefix=str(uid)+'-', dir=output_directory) + '/' -+ device_path = '/device' -+ -+ pid = os.fork() -+ if pid: -+ # parent -+ _, rc = os.waitpid(pid, 0) -+ -+ for path in [path_kernel, path_ramdisk]: -+ # If the child didn't write anything, just get rid of it, -+ # otherwise we end up consuming a 0-size file when parsing -+ # systems without a ramdisk that the ultimate caller of pygrub -+ # may just be unaware of -+ if rc != 0 or os.path.getsize(path) == 0: -+ os.unlink(path) -+ -+ # Normally, unshare(CLONE_NEWNS) will ensure this is not required. -+ # However, this syscall doesn't exist in *BSD systems and doesn't -+ # auto-unmount everything on older Linux kernels (At least as of -+ # Linux 4.19, but it seems fixed in 5.15). Either way, -+ # recursively unmount everything if needed. Quietly. -+ with open('/dev/null', 'w') as devnull: -+ subprocess.call(["umount", "-f", chroot + device_path], -+ stdout=devnull, stderr=devnull) -+ subprocess.call(["umount", "-f", chroot], -+ stdout=devnull, stderr=devnull) -+ os.rmdir(chroot) -+ -+ sys.exit(rc) -+ -+ # By unsharing the namespace we're making sure it's all bulk-released -+ # at the end, when the namespaces disappear. This means the kernel does -+ # (almost) all the cleanup for us and the parent just has to remove the -+ # temporary directory. -+ unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET) -+ -+ # Set sensible limits using the setrlimit interface -+ downgrade_rlimits() -+ -+ # We'll mount tmpfs on the chroot to ensure the deprivileged child -+ # cannot affect the persistent state. It's RW now in order to -+ # bind-mount the device, but note it's remounted RO after that. -+ rc = subprocess.call(["mount", "-t", "tmpfs", "none", chroot]) -+ if rc != 0: -+ raise RuntimeError("mount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) -+ -+ # Bind the untrusted device RO -+ bind_mount(device, chroot + device_path, "ro,nosuid,noexec") -+ -+ rc = subprocess.call(["mount", "-t", "tmpfs", "-o", "remount,ro,nosuid,noexec,nodev", "none", chroot]) -+ if rc != 0: -+ raise RuntimeError("remount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) -+ -+ # Drop superpowers! 
-+ os.chroot(chroot) -+ os.chdir('/') -+ os.setgid(uid) -+ os.setgroups([uid]) -+ os.setuid(uid) -+ -+ return device_path -+ - def read_size_roundup(fd, size): - if platform.system() != 'FreeBSD': - return size -@@ -736,7 +864,7 @@ if __name__ == "__main__": - sel = None - - def usage(): -- print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] " %(sys.argv[0],), file=sys.stderr) -+ print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--runas=] [--offset=] " %(sys.argv[0],), file=sys.stderr) - - def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): - if not_really: -@@ -760,7 +888,8 @@ if __name__ == "__main__": - os.write(fd_dst, data) - except Exception as e: - print(e, file=sys.stderr) -- os.unlink(path_dst) -+ if path_dst: -+ os.unlink(path_dst) - del datafile - sys.exit("Error writing temporary copy of "+file_type) - dataoff += len(data) -@@ -769,7 +898,7 @@ if __name__ == "__main__": - opts, args = getopt.gnu_getopt(sys.argv[1:], 'qilnh::', - ["quiet", "interactive", "list-entries", "not-really", "help", - "output=", "output-format=", "output-directory=", "offset=", -- "entry=", "kernel=", -+ "runas=", "entry=", "kernel=", - "ramdisk=", "args=", "isconfig", "debug"]) - except getopt.GetoptError: - usage() -@@ -790,6 +919,7 @@ if __name__ == "__main__": - not_really = False - output_format = "sxp" - output_directory = "/var/run/xen/pygrub/" -+ uid = None - - # what was passed in - incfg = { "kernel": None, "ramdisk": None, "args": "" } -@@ -813,6 +943,13 @@ if __name__ == "__main__": - elif o in ("--output",): - if a != "-": - output = a -+ elif o in ("--runas",): -+ try: -+ uid = int(a) -+ except ValueError: -+ print("runas value must be an integer user id") -+ usage() -+ sys.exit(1) - elif o in ("--kernel",): - incfg["kernel"] = a - elif o in ("--ramdisk",): -@@ -849,6 +986,10 @@ if __name__ == "__main__": - if debug: - logging.basicConfig(level=logging.DEBUG) - -+ if interactive and uid: -+ print("In order to use --runas, you must also set --entry or -q", file=sys.stderr) -+ sys.exit(1) -+ - try: - os.makedirs(output_directory, 0o700) - except OSError as e: -@@ -870,6 +1011,9 @@ if __name__ == "__main__": - else: - fd = os.open(output, os.O_WRONLY) - -+ if uid: -+ file = depriv(output_directory, output, file, uid, path_kernel, path_ramdisk) -+ - # debug - if isconfig: - chosencfg = run_grub(file, entry, fs, incfg["args"]) -@@ -925,21 +1069,21 @@ if __name__ == "__main__": - raise RuntimeError("Unable to find partition containing kernel") - - copy_from_image(fs, chosencfg["kernel"], "kernel", -- fd_kernel, path_kernel, not_really) -+ fd_kernel, None if uid else path_kernel, not_really) - bootcfg["kernel"] = path_kernel - - if chosencfg["ramdisk"]: - try: - copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", -- fd_ramdisk, path_ramdisk, not_really) -+ fd_ramdisk, None if uid else path_ramdisk, not_really) - except: -- if not not_really: -- os.unlink(path_kernel) -+ if not uid and not not_really: -+ os.unlink(path_kernel) - raise - bootcfg["ramdisk"] = path_ramdisk - else: - initrd = None -- if not not_really: -+ if not uid and not not_really: - os.unlink(path_ramdisk) - - args = None --- -2.42.0 - diff --git a/xsa443-4.17-10.patch 
b/xsa443-4.17-10.patch deleted file mode 100644 index 7c91f32..0000000 --- a/xsa443-4.17-10.patch +++ /dev/null @@ -1,250 +0,0 @@ -From 698b451473a6d868ca0f60a124fc4f31d81cd7b1 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Mon, 25 Sep 2023 14:30:20 +0200 -Subject: [PATCH 10/11] libxl: add support for running bootloader in restricted - mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Much like the device model depriv mode, add the same kind of support for the -bootloader. Such feature allows passing a UID as a parameter for the -bootloader to run as, together with the bootloader itself taking the necessary -actions to isolate. - -Note that the user to run the bootloader as must have the right permissions to -access the guest disk image (in read mode only), and that the bootloader will -be run in non-interactive mode when restricted. - -If enabled bootloader restrict mode will attempt to re-use the user(s) from the -QEMU depriv implementation if no user is provided on the configuration file or -the environment. See docs/features/qemu-deprivilege.pandoc for more -information about how to setup those users. - -Bootloader restrict mode is not enabled by default as it requires certain -setup to be done first (setup of the user(s) to use in restrict mode). - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Anthony PERARD ---- - docs/man/xl.1.pod.in | 33 +++++++++++ - tools/libs/light/libxl_bootloader.c | 89 ++++++++++++++++++++++++++++- - tools/libs/light/libxl_dm.c | 8 +-- - tools/libs/light/libxl_internal.h | 8 +++ - 4 files changed, 131 insertions(+), 7 deletions(-) - -diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in -index 101e14241d1c..4831e122427d 100644 ---- a/docs/man/xl.1.pod.in -+++ b/docs/man/xl.1.pod.in -@@ -1957,6 +1957,39 @@ ignored: - - =back - -+=head1 ENVIRONMENT VARIABLES -+ -+The following environment variables shall affect the execution of xl: -+ -+=over 4 -+ -+=item LIBXL_BOOTLOADER_RESTRICT -+ -+Attempt to restrict the bootloader after startup, to limit the -+consequences of security vulnerabilities due to parsing guest -+owned image files. -+ -+See docs/features/qemu-deprivilege.pandoc for more information -+on how to setup the unprivileged users. -+ -+Note that running the bootloader in restricted mode also implies using -+non-interactive mode, and the disk image must be readable by the -+restricted user. -+ -+Having this variable set is equivalent to enabling the option, even if the -+value is 0. -+ -+=item LIBXL_BOOTLOADER_USER -+ -+When using bootloader_restrict, run the bootloader as this user. If -+not set the default QEMU restrict users will be used. -+ -+NOTE: Each domain MUST have a SEPARATE username. -+ -+See docs/features/qemu-deprivilege.pandoc for more information. 
-+ -+=back -+ - =head1 SEE ALSO - - The following man pages: -diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c -index 108329b4a5bb..23c0ef3e8935 100644 ---- a/tools/libs/light/libxl_bootloader.c -+++ b/tools/libs/light/libxl_bootloader.c -@@ -14,6 +14,7 @@ - - #include "libxl_osdeps.h" /* must come before any other headers */ - -+#include - #include - #ifdef HAVE_UTMP_H - #include -@@ -42,8 +43,71 @@ static void bootloader_arg(libxl__bootloader_state *bl, const char *arg) - bl->args[bl->nargs++] = arg; - } - --static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, -- const char *bootloader_path) -+static int bootloader_uid(libxl__gc *gc, domid_t guest_domid, -+ const char *user, uid_t *intended_uid) -+{ -+ struct passwd *user_base, user_pwbuf; -+ int rc; -+ -+ if (user) { -+ rc = userlookup_helper_getpwnam(gc, user, &user_pwbuf, &user_base); -+ if (rc) return rc; -+ -+ if (!user_base) { -+ LOGD(ERROR, guest_domid, "Couldn't find user %s", user); -+ return ERROR_INVAL; -+ } -+ -+ *intended_uid = user_base->pw_uid; -+ return 0; -+ } -+ -+ /* Re-use QEMU user range for the bootloader. */ -+ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_RANGE_BASE, -+ &user_pwbuf, &user_base); -+ if (rc) return rc; -+ -+ if (user_base) { -+ struct passwd *user_clash, user_clash_pwbuf; -+ uid_t temp_uid = user_base->pw_uid + guest_domid; -+ -+ rc = userlookup_helper_getpwuid(gc, temp_uid, &user_clash_pwbuf, -+ &user_clash); -+ if (rc) return rc; -+ -+ if (user_clash) { -+ LOGD(ERROR, guest_domid, -+ "wanted to use uid %ld (%s + %d) but that is user %s !", -+ (long)temp_uid, LIBXL_QEMU_USER_RANGE_BASE, -+ guest_domid, user_clash->pw_name); -+ return ERROR_INVAL; -+ } -+ -+ *intended_uid = temp_uid; -+ return 0; -+ } -+ -+ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_SHARED, &user_pwbuf, -+ &user_base); -+ if (rc) return rc; -+ -+ if (user_base) { -+ LOGD(WARN, guest_domid, "Could not find user %s, falling back to %s", -+ LIBXL_QEMU_USER_RANGE_BASE, LIBXL_QEMU_USER_SHARED); -+ *intended_uid = user_base->pw_uid; -+ -+ return 0; -+ } -+ -+ LOGD(ERROR, guest_domid, -+ "Could not find user %s or range base pseudo-user %s, cannot restrict", -+ LIBXL_QEMU_USER_SHARED, LIBXL_QEMU_USER_RANGE_BASE); -+ -+ return ERROR_INVAL; -+} -+ -+static int make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, -+ const char *bootloader_path) - { - const libxl_domain_build_info *info = bl->info; - -@@ -61,6 +125,23 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, - ARG(GCSPRINTF("--ramdisk=%s", info->ramdisk)); - if (info->cmdline && *info->cmdline != '\0') - ARG(GCSPRINTF("--args=%s", info->cmdline)); -+ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || -+ getenv("LIBXL_BOOTLOADER_USER")) { -+ uid_t uid = -1; -+ int rc = bootloader_uid(gc, bl->domid, getenv("LIBXL_BOOTLOADER_USER"), -+ &uid); -+ -+ if (rc) return rc; -+ -+ assert(uid != -1); -+ if (!uid) { -+ LOGD(ERROR, bl->domid, "bootloader restrict UID is 0 (root)!"); -+ return ERROR_INVAL; -+ } -+ LOGD(DEBUG, bl->domid, "using uid %ld", (long)uid); -+ ARG(GCSPRINTF("--runas=%ld", (long)uid)); -+ ARG("--quiet"); -+ } - - ARG(GCSPRINTF("--output=%s", bl->outputpath)); - ARG("--output-format=simple0"); -@@ -79,6 +160,7 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, - /* Sentinel for execv */ - ARG(NULL); - -+ return 0; - #undef ARG - } - -@@ -443,7 +525,8 @@ static void bootloader_disk_attached_cb(libxl__egc *egc, - bootloader = bltmp; - } - -- 
make_bootloader_args(gc, bl, bootloader); -+ rc = make_bootloader_args(gc, bl, bootloader); -+ if (rc) goto out; - - bl->openpty.ao = ao; - bl->openpty.callback = bootloader_gotptys; -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index fc264a3a13a6..14b593110f7c 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -80,10 +80,10 @@ static int libxl__create_qemu_logfile(libxl__gc *gc, char *name) - * On error, return a libxl-style error code. - */ - #define DEFINE_USERLOOKUP_HELPER(NAME,SPEC_TYPE,STRUCTNAME,SYSCONF) \ -- static int userlookup_helper_##NAME(libxl__gc *gc, \ -- SPEC_TYPE spec, \ -- struct STRUCTNAME *resultbuf, \ -- struct STRUCTNAME **out) \ -+ int userlookup_helper_##NAME(libxl__gc *gc, \ -+ SPEC_TYPE spec, \ -+ struct STRUCTNAME *resultbuf, \ -+ struct STRUCTNAME **out) \ - { \ - struct STRUCTNAME *resultp = NULL; \ - char *buf = NULL; \ -diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h -index 7ad38de30e0b..f1e3a9a15b13 100644 ---- a/tools/libs/light/libxl_internal.h -+++ b/tools/libs/light/libxl_internal.h -@@ -4873,6 +4873,14 @@ struct libxl__cpu_policy { - struct xc_msr *msr; - }; - -+struct passwd; -+_hidden int userlookup_helper_getpwnam(libxl__gc*, const char *user, -+ struct passwd *res, -+ struct passwd **out); -+_hidden int userlookup_helper_getpwuid(libxl__gc*, uid_t uid, -+ struct passwd *res, -+ struct passwd **out); -+ - #endif - - /* --- -2.42.0 - diff --git a/xsa443-4.17-11.patch b/xsa443-4.17-11.patch deleted file mode 100644 index 27e6f78..0000000 --- a/xsa443-4.17-11.patch +++ /dev/null @@ -1,157 +0,0 @@ -From 9d480426bfa2c68843ac8395b512e06fbdbcf53e Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Thu, 28 Sep 2023 12:22:35 +0200 -Subject: [PATCH 11/11] libxl: limit bootloader execution in restricted mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a timeout for bootloader execution when running in restricted mode. - -Allow overwriting the default time out with an environment provided value. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Anthony PERARD ---- - docs/man/xl.1.pod.in | 8 ++++++ - tools/libs/light/libxl_bootloader.c | 40 +++++++++++++++++++++++++++++ - tools/libs/light/libxl_internal.h | 2 ++ - 3 files changed, 50 insertions(+) - -diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in -index 4831e122427d..c3eb6570ab8b 100644 ---- a/docs/man/xl.1.pod.in -+++ b/docs/man/xl.1.pod.in -@@ -1988,6 +1988,14 @@ NOTE: Each domain MUST have a SEPARATE username. - - See docs/features/qemu-deprivilege.pandoc for more information. - -+=item LIBXL_BOOTLOADER_TIMEOUT -+ -+Timeout in seconds for bootloader execution when running in restricted mode. -+Otherwise the build time default in LIBXL_BOOTLOADER_TIMEOUT will be used. -+ -+If defined the value must be an unsigned integer between 0 and INT_MAX, -+otherwise behavior is undefined. Setting to 0 disables the timeout. 
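
The semantics reduce to a kill-on-deadline pattern; a hedged C sketch using
plain alarm(2) in place of libxl's event machinery:

    #include <errno.h>
    #include <signal.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static volatile pid_t child;

    static void on_timeout(int sig)
    {
        (void)sig;
        kill(child, SIGKILL);           /* reaped by the waitpid below */
    }

    int run_with_timeout(char *const argv[], unsigned int secs)
    {
        int status;

        child = fork();
        if ( child == 0 )
        {
            execv(argv[0], argv);
            _exit(127);
        }

        signal(SIGALRM, on_timeout);
        if ( secs )                     /* 0 disables the timeout */
            alarm(secs);

        while ( waitpid(child, &status, 0) < 0 && errno == EINTR )
            ;                           /* interrupted by the alarm */

        alarm(0);
        return status;
    }
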
-+ - =back - - =head1 SEE ALSO -diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c -index 23c0ef3e8935..ee26d08f3765 100644 ---- a/tools/libs/light/libxl_bootloader.c -+++ b/tools/libs/light/libxl_bootloader.c -@@ -30,6 +30,8 @@ static void bootloader_keystrokes_copyfail(libxl__egc *egc, - libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); - static void bootloader_display_copyfail(libxl__egc *egc, - libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); -+static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, -+ const struct timeval *requested_abs, int rc); - static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc, - int rc); - static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, -@@ -297,6 +299,7 @@ void libxl__bootloader_init(libxl__bootloader_state *bl) - bl->ptys[0].master = bl->ptys[0].slave = 0; - bl->ptys[1].master = bl->ptys[1].slave = 0; - libxl__ev_child_init(&bl->child); -+ libxl__ev_time_init(&bl->time); - libxl__domaindeathcheck_init(&bl->deathcheck); - bl->keystrokes.ao = bl->ao; libxl__datacopier_init(&bl->keystrokes); - bl->display.ao = bl->ao; libxl__datacopier_init(&bl->display); -@@ -314,6 +317,7 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl) - libxl__domaindeathcheck_stop(gc,&bl->deathcheck); - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); -+ libxl__ev_time_deregister(gc, &bl->time); - for (i=0; i<2; i++) { - libxl__carefd_close(bl->ptys[i].master); - libxl__carefd_close(bl->ptys[i].slave); -@@ -375,6 +379,7 @@ static void bootloader_stop(libxl__egc *egc, - - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); -+ libxl__ev_time_deregister(gc, &bl->time); - if (libxl__ev_child_inuse(&bl->child)) { - r = kill(bl->child.pid, SIGTERM); - if (r) LOGED(WARN, bl->domid, "%sfailed to kill bootloader [%lu]", -@@ -637,6 +642,25 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op) - - struct termios termattr; - -+ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || -+ getenv("LIBXL_BOOTLOADER_USER")) { -+ const char *timeout_env = getenv("LIBXL_BOOTLOADER_TIMEOUT"); -+ int timeout = timeout_env ? 
atoi(timeout_env) -+ : LIBXL_BOOTLOADER_TIMEOUT; -+ -+ if (timeout) { -+ /* Set execution timeout */ -+ rc = libxl__ev_time_register_rel(ao, &bl->time, -+ bootloader_timeout, -+ timeout * 1000); -+ if (rc) { -+ LOGED(ERROR, bl->domid, -+ "unable to register timeout for bootloader execution"); -+ goto out; -+ } -+ } -+ } -+ - pid_t pid = libxl__ev_child_fork(gc, &bl->child, bootloader_finished); - if (pid == -1) { - rc = ERROR_FAIL; -@@ -702,6 +726,21 @@ static void bootloader_display_copyfail(libxl__egc *egc, - libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display); - bootloader_copyfail(egc, "bootloader output", bl, 1, rc,onwrite,errnoval); - } -+static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, -+ const struct timeval *requested_abs, int rc) -+{ -+ libxl__bootloader_state *bl = CONTAINER_OF(ev, *bl, time); -+ STATE_AO_GC(bl->ao); -+ -+ libxl__ev_time_deregister(gc, &bl->time); -+ -+ assert(libxl__ev_child_inuse(&bl->child)); -+ LOGD(ERROR, bl->domid, "killing bootloader because of timeout"); -+ -+ libxl__ev_child_kill_deregister(ao, &bl->child, SIGKILL); -+ -+ bootloader_callback(egc, bl, rc); -+} - - static void bootloader_domaindeath(libxl__egc *egc, - libxl__domaindeathcheck *dc, -@@ -718,6 +757,7 @@ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, - STATE_AO_GC(bl->ao); - int rc; - -+ libxl__ev_time_deregister(gc, &bl->time); - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); - -diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h -index f1e3a9a15b13..d05783617ff5 100644 ---- a/tools/libs/light/libxl_internal.h -+++ b/tools/libs/light/libxl_internal.h -@@ -102,6 +102,7 @@ - #define LIBXL_QMP_CMD_TIMEOUT 10 - #define LIBXL_STUBDOM_START_TIMEOUT 30 - #define LIBXL_QEMU_BODGE_TIMEOUT 2 -+#define LIBXL_BOOTLOADER_TIMEOUT 120 - #define LIBXL_XENCONSOLE_LIMIT 1048576 - #define LIBXL_XENCONSOLE_PROTOCOL "vt100" - #define LIBXL_MAXMEM_CONSTANT 1024 -@@ -3744,6 +3745,7 @@ struct libxl__bootloader_state { - libxl__openpty_state openpty; - libxl__openpty_result ptys[2]; /* [0] is for bootloader */ - libxl__ev_child child; -+ libxl__ev_time time; - libxl__domaindeathcheck deathcheck; - int nargs, argsspace; - const char **args; --- -2.42.0 - diff --git a/xsa444-4.17-1.patch b/xsa444-4.17-1.patch deleted file mode 100644 index 5a4b2e5..0000000 --- a/xsa444-4.17-1.patch +++ /dev/null @@ -1,93 +0,0 @@ -From: Andrew Cooper -Subject: x86/svm: Fix asymmetry with AMD DR MASK context switching - -The handling of MSR_DR{0..3}_MASK is asymmetric between PV and HVM guests. - -HVM guests context switch in based on the guest view of DBEXT, whereas PV -guest switch in base on the host capability. Both guest types leave the -context dirty for the next vCPU. - -This leads to the following issue: - - * PV or HVM guest has debugging active (%dr7 + mask) - * Switch-out deactivates %dr7 but leaves other state stale in hardware - * Another HVM guest with masks unavailable has debugging active - * Switch in loads %dr7 but leaves the mask MSRs alone - -Now, the second guest's vCPU is operating in the context of the prior vCPU's -mask MSR, while the environment the vCPU can see says there are no mask MSRs. - -As a stopgap, adjust the HVM path to switch in the masks based on host -capabilities rather than guest visibility (i.e. like the PV path). Adjustment -of the intercepts still needs to be dependent on the guest visibility of -DBEXT. 
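
To illustrate what the stale context means in practice, a toy model of the
DBEXT match rule (the addresses are made up; the real comparison happens in
hardware):

    #include <stdint.h>
    #include <stdio.h>

    /* AMD DBEXT: address bits covered by set mask bits are ignored when
     * matching, widening a breakpoint from 8 bytes to a 4G-aligned region. */
    static int bp_hits(uint64_t dr0, uint32_t mask, uint64_t addr)
    {
        return (addr & ~(uint64_t)mask) == (dr0 & ~(uint64_t)mask);
    }

    int main(void)
    {
        uint64_t dr0 = 0x00007f0000001000ULL;

        /* Hardware still holds the previous vCPU's 4G-wide mask: hit. */
        printf("%d\n", bp_hits(dr0, 0xffffffffu, 0x00007f00deadbeefULL));

        /* What the incoming vCPU, which cannot see DBEXT, expects: miss. */
        printf("%d\n", bp_hits(dr0, 0, 0x00007f00deadbeefULL));

        return 0;
    }
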
- -This is part of XSA-444 / CVE-2023-34327 - -Fixes: c097f54912d3 ("x86/SVM: support data breakpoint extension registers") -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c -index e8f50e7c5ec7..fd32600ae364 100644 ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -339,6 +339,10 @@ static void svm_save_dr(struct vcpu *v) - v->arch.hvm.flag_dr_dirty = 0; - vmcb_set_dr_intercepts(vmcb, ~0u); - -+ /* -+ * The guest can only have changed the mask MSRs if we previous dropped -+ * intercepts. Re-read them from hardware. -+ */ - if ( v->domain->arch.cpuid->extd.dbext ) - { - svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW); -@@ -370,17 +374,25 @@ static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v) - - ASSERT(v == current); - -- if ( v->domain->arch.cpuid->extd.dbext ) -+ /* -+ * Both the PV and HVM paths leave stale DR_MASK values in hardware on -+ * context-switch-out. If we're activating %dr7 for the guest, we must -+ * sync the DR_MASKs too, whether or not the guest can see them. -+ */ -+ if ( boot_cpu_has(X86_FEATURE_DBEXT) ) - { -- svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- - wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.msrs->dr_mask[0]); - wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.msrs->dr_mask[1]); - wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.msrs->dr_mask[2]); - wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.msrs->dr_mask[3]); -+ -+ if ( v->domain->arch.cpuid->extd.dbext ) -+ { -+ svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ } - } - - write_debugreg(0, v->arch.dr[0]); -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index e65cc6004148..06c4f3868b7a 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -2281,6 +2281,11 @@ void activate_debugregs(const struct vcpu *curr) - if ( curr->arch.dr7 & DR7_ACTIVE_MASK ) - write_debugreg(7, curr->arch.dr7); - -+ /* -+ * Both the PV and HVM paths leave stale DR_MASK values in hardware on -+ * context-switch-out. If we're activating %dr7 for the guest, we must -+ * sync the DR_MASKs too, whether or not the guest can see them. -+ */ - if ( boot_cpu_has(X86_FEATURE_DBEXT) ) - { - wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]); diff --git a/xsa444-4.17-2.patch b/xsa444-4.17-2.patch deleted file mode 100644 index 2687bd1..0000000 --- a/xsa444-4.17-2.patch +++ /dev/null @@ -1,72 +0,0 @@ -From: Andrew Cooper -Subject: x86/pv: Correct the auditing of guest breakpoint addresses - -The use of access_ok() is buggy, because it permits access to the compat -translation area. 64bit PV guests don't use the XLAT area, but on AMD -hardware, the DBEXT feature allows a breakpoint to match up to a 4G aligned -region, allowing the breakpoint to reach outside of the XLAT area. - -Prior to c/s cda16c1bb223 ("x86: mirror compat argument translation area for -32-bit PV"), the live GDT was within 4G of the XLAT area. 
- -All together, this allowed a malicious 64bit PV guest on AMD hardware to place -a breakpoint over the live GDT, and trigger a #DB livelock (CVE-2015-8104). - -Introduce breakpoint_addr_ok() and explain why __addr_ok() happens to be an -appropriate check in this case. - -For Xen 4.14 and later, this is a latent bug because the XLAT area has moved -to be on its own with nothing interesting adjacent. For Xen 4.13 and older on -AMD hardware, this fixes a PV-trigger-able DoS. - -This is part of XSA-444 / CVE-2023-34328. - -Fixes: 65e355490817 ("x86/PV: support data breakpoint extension registers") -Signed-off-by: Andrew Cooper -Reviewed-by: Roger Pau Monné -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/include/asm/debugreg.h b/xen/arch/x86/include/asm/debugreg.h -index c57914efc6e8..cc298265244b 100644 ---- a/xen/arch/x86/include/asm/debugreg.h -+++ b/xen/arch/x86/include/asm/debugreg.h -@@ -77,6 +77,26 @@ - asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \ - __val; \ - }) -+ -+/* -+ * Architecturally, %dr{0..3} can have any arbitrary value. However, Xen -+ * can't allow the guest to breakpoint the Xen address range, so we limit the -+ * guest to the lower canonical half, or above the Xen range in the higher -+ * canonical half. -+ * -+ * Breakpoint lengths are specified to mask the low order address bits, -+ * meaning all breakpoints are naturally aligned. With %dr7, the widest -+ * breakpoint is 8 bytes. With DBEXT, the widest breakpoint is 4G. Both of -+ * the Xen boundaries have >4G alignment. -+ * -+ * In principle we should account for HYPERVISOR_COMPAT_VIRT_START(d), but -+ * 64bit Xen has never enforced this for compat guests, and there's no problem -+ * (to Xen) if the guest breakpoints it's alias of the M2P. Skipping this -+ * aspect simplifies the logic, and causes us not to reject a migrating guest -+ * which operated fine on prior versions of Xen. -+ */ -+#define breakpoint_addr_ok(a) __addr_ok(a) -+ - long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value); - void activate_debugregs(const struct vcpu *); - -diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c -index aaaf70eb6330..f8636de907ae 100644 ---- a/xen/arch/x86/pv/misc-hypercalls.c -+++ b/xen/arch/x86/pv/misc-hypercalls.c -@@ -72,7 +72,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - switch ( reg ) - { - case 0 ... 3: -- if ( !access_ok(value, sizeof(long)) ) -+ if ( !breakpoint_addr_ok(value) ) - return -EPERM; - - v->arch.dr[reg] = value; diff --git a/xsa445-4.17.patch b/xsa445-4.17.patch deleted file mode 100644 index db66d7c..0000000 --- a/xsa445-4.17.patch +++ /dev/null @@ -1,63 +0,0 @@ -From a43127d4f1f9a364334fe16b6239c211b35fd238 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Wed, 11 Oct 2023 13:14:21 +0200 -Subject: [PATCH] iommu/amd-vi: use correct level for quarantine domain page - tables -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current setup of the quarantine page tables assumes that the quarantine -domain (dom_io) has been initialized with an address width of -DEFAULT_DOMAIN_ADDRESS_WIDTH (48). - -However dom_io being a PV domain gets the AMD-Vi IOMMU page tables levels based -on the maximum (hot pluggable) RAM address, and hence on systems with no RAM -above the 512GB mark only 3 page-table levels are configured in the IOMMU. 
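
The level arithmetic behind those numbers, as a small runnable sketch:

    #include <stdio.h>

    /* Each AMD-Vi page-table level resolves 9 address bits on top of the
     * 12-bit page offset (amd_iommu_get_paging_mode() is the real helper). */
    static unsigned int levels_for_width(unsigned int width)
    {
        unsigned int levels = 1;

        while ( 12 + 9 * levels < width )
            levels++;

        return levels;
    }

    int main(void)
    {
        printf("%u\n", levels_for_width(39)); /* RAM below 512GB -> 3 */
        printf("%u\n", levels_for_width(48)); /* 48-bit default  -> 4 */
        return 0;
    }
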
- -On systems without RAM above the 512GB boundary amd_iommu_quarantine_init() -will setup page tables for the scratch page with 4 levels, while the IOMMU will -be configured to use 3 levels only. The page destined to be used as level 1, -and to contain a directory of PTEs ends up being the address in a PTE itself, -and thus level 1 page becomes the leaf page. Without the level mismatch it's -level 0 page that should be the leaf page instead. - -The level 1 page won't be used as such, and hence it's not possible to use it -to gain access to other memory on the system. However that page is not cleared -in amd_iommu_quarantine_init() as part of re-initialization of the device -quarantine page tables, and hence data on the level 1 page can be leaked -between device usages. - -Fix this by making sure the paging levels setup by amd_iommu_quarantine_init() -match the number configured on the IOMMUs. - -Note that IVMD regions are not affected by this issue, as those areas are -mapped taking the configured paging levels into account. - -This is XSA-445 / CVE-2023-46835 - -Fixes: ea38867831da ('x86 / iommu: set up a scratch page in the quarantine domain') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich ---- - xen/drivers/passthrough/amd/iommu_map.c | 4 +--- - 1 file changed, 1 insertion(+), 3 deletions(-) - -diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c -index 993bac6f8878..e0f4fe736a8d 100644 ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -837,9 +837,7 @@ static int fill_qpt(union amd_iommu_pte *this, unsigned int level, - int cf_check amd_iommu_quarantine_init(struct pci_dev *pdev, bool scratch_page) - { - struct domain_iommu *hd = dom_iommu(dom_io); -- unsigned long end_gfn = -- 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); -- unsigned int level = amd_iommu_get_paging_mode(end_gfn); -+ unsigned int level = hd->arch.amd.paging_mode; - unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf); - const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); - int rc; --- -2.42.0 - diff --git a/xsa446.patch b/xsa446.patch deleted file mode 100644 index acf1d0f..0000000 --- a/xsa446.patch +++ /dev/null @@ -1,115 +0,0 @@ -From 80d5aada598c3a800a350003d5d582931545e13c Mon Sep 17 00:00:00 2001 -From: Andrew Cooper -Date: Thu, 26 Oct 2023 14:37:38 +0100 -Subject: [PATCH] x86/spec-ctrl: Remove conditional IRQs-on-ness for INT - $0x80/0x82 paths -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Before speculation defences, some paths in Xen could genuinely get away with -being IRQs-on at entry. But XPTI invalidated this property on most paths, and -attempting to maintain it on the remaining paths was a mistake. - -Fast forward, and DO_SPEC_CTRL_COND_IBPB (protection for AMD BTC/SRSO) is not -IRQ-safe, running with IRQs enabled in some cases. The other actions taken on -these paths happen to be IRQ-safe. - -Make entry_int82() and int80_direct_trap() unconditionally Interrupt Gates -rather than Trap Gates. Remove the conditional re-adjustment of -int80_direct_trap() in smp_prepare_cpus(), and have entry_int82() explicitly -enable interrupts when safe to do so. - -In smp_prepare_cpus(), with the conditional re-adjustment removed, the -clearing of pv_cr3 is the only remaining action gated on XPTI, and it is out -of place anyway, repeating work already done by smp_prepare_boot_cpu(). 
Drop -the entire if() condition to avoid leaving an incorrect vestigial remnant. - -Also drop comments which make incorrect statements about when its safe to -enable interrupts. - -This is XSA-446 / CVE-2023-46836 - -Signed-off-by: Andrew Cooper -Reviewed-by: Roger Pau Monné ---- - xen/arch/x86/pv/traps.c | 4 ++-- - xen/arch/x86/smpboot.c | 14 -------------- - xen/arch/x86/x86_64/compat/entry.S | 2 ++ - xen/arch/x86/x86_64/entry.S | 1 - - 4 files changed, 4 insertions(+), 17 deletions(-) - -diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c -index 74f333da7e1c..240d1a2db7a3 100644 ---- a/xen/arch/x86/pv/traps.c -+++ b/xen/arch/x86/pv/traps.c -@@ -139,11 +139,11 @@ void __init pv_trap_init(void) - #ifdef CONFIG_PV32 - /* The 32-on-64 hypercall vector is only accessible from ring 1. */ - _set_gate(idt_table + HYPERCALL_VECTOR, -- SYS_DESC_trap_gate, 1, entry_int82); -+ SYS_DESC_irq_gate, 1, entry_int82); - #endif - - /* Fast trap for int80 (faster than taking the #GP-fixup path). */ -- _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_trap_gate, 3, -+ _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3, - &int80_direct_trap); - - open_softirq(NMI_SOFTIRQ, nmi_softirq); -diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c -index 3a1a659082c6..4c54ecbc91d7 100644 ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -1158,20 +1158,6 @@ void __init smp_prepare_cpus(void) - - stack_base[0] = (void *)((unsigned long)stack_start & ~(STACK_SIZE - 1)); - -- if ( opt_xpti_hwdom || opt_xpti_domu ) -- { -- get_cpu_info()->pv_cr3 = 0; -- --#ifdef CONFIG_PV -- /* -- * All entry points which may need to switch page tables have to start -- * with interrupts off. Re-write what pv_trap_init() has put there. -- */ -- _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3, -- &int80_direct_trap); --#endif -- } -- - set_nr_sockets(); - - socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets); -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index bd5abd8040bd..fcc3a721f147 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -21,6 +21,8 @@ ENTRY(entry_int82) - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - -+ sti -+ - CR4_PV32_RESTORE - - GET_CURRENT(bx) -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 5ca74f5f62b2..9a7b129aa7e4 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -327,7 +327,6 @@ ENTRY(sysenter_entry) - #ifdef CONFIG_XEN_SHSTK - ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK - #endif -- /* sti could live here when we don't switch page tables below. */ - pushq $FLAT_USER_SS - pushq $0 - pushfq - -base-commit: 7befef87cc9b1bb8ca15d866ce1ecd9165ccb58c -prerequisite-patch-id: 142a87c707411d49e136c3fb76f1b14963ec6dc8 --- -2.30.2 - diff --git a/xsa447.patch b/xsa447.patch deleted file mode 100644 index 2e26396..0000000 --- a/xsa447.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 084c7312fa6c1d4a7fa343efa1d7d73693dafff4 Mon Sep 17 00:00:00 2001 -From: Michal Orzel -Date: Thu, 23 Nov 2023 15:53:02 +0100 -Subject: [PATCH] xen/arm: page: Avoid pointer overflow on cache clean & - invalidate - -On Arm32, after cleaning and invalidating the last dcache line of the top -domheap page i.e. 
VA = 0xfffff000 (as a result of flushing the page to -RAM), we end up adding the value of a dcache line size to the pointer -once again, which results in a pointer arithmetic overflow (with 64B line -size, operation 0xffffffc0 + 0x40 overflows to 0x0). Such behavior is -undefined and given the wide range of compiler versions we support, it is -difficult to determine what could happen in such scenario. - -Modify clean_and_invalidate_dcache_va_range() as well as -clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity -of handling to prevent pointer arithmetic overflow. Modify the loops to -use an additional variable to store the index of the next cacheline. -Add an assert to prevent passing a region that wraps around which is -illegal and would end up in a page fault anyway (region 0-2MB is -unmapped). Lastly, return early if size passed is 0. - -Note that on Arm64, we don't have this problem given that the max VA -space we support is 48-bits. - -This is XSA-447 / CVE-2023-46837. - -Signed-off-by: Michal Orzel -Reviewed-by: Julien Grall ---- - xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++------- - 1 file changed, 28 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h -index ebaf5964f114..69f817d1e68a 100644 ---- a/xen/arch/arm/include/asm/page.h -+++ b/xen/arch/arm/include/asm/page.h -@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void) - static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - -@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - } - - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx)); - - if ( size > 0 ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - - dsb(sy); /* So we know the flushes happen before continuing */ - -@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - static inline int clean_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p = (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. 
*/ - return 0; -@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range - (const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p = (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. */ - return 0; --- -2.40.1 - diff --git a/xsa449.patch b/xsa449.patch deleted file mode 100644 index 80aeac2..0000000 --- a/xsa449.patch +++ /dev/null @@ -1,89 +0,0 @@ -From d8b92b21b224126860978e4c604302f3c1e3bf75 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Wed, 13 Dec 2023 15:51:59 +0100 -Subject: [PATCH] pci: fail device assignment if phantom functions cannot be - assigned -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current behavior is that no error is reported if (some) phantom functions -fail to be assigned during device add or assignment, so the operation succeeds -even if some phantom functions are not correctly setup. - -This can lead to devices possibly being successfully assigned to a domU while -some of the device phantom functions are still assigned to dom0. Even when the -device is assigned domIO before being assigned to a domU phantom functions -might fail to be assigned to domIO, and also fail to be assigned to the domU, -leaving them assigned to dom0. - -Since the device can generate requests using the IDs of those phantom -functions, given the scenario above a device in such state would be in control -of a domU, but still capable of generating transactions that use a context ID -targeting dom0 owned memory. - -Modify device assign in order to attempt to deassign the device if phantom -functions failed to be assigned. - -Note that device addition is not modified in the same way, as in that case the -device is assigned to a trusted domain, and hence partial assign can lead to -device malfunction but not a security issue. 
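
The assign-or-roll-back shape of the fix, as a self-contained toy (every
name is hypothetical; function 2 is hardwired to fail for demonstration):

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_FN 4 /* device function plus phantom functions */

    static bool assign_one(unsigned int fn) { return fn != 2; }
    static void deassign_one(unsigned int fn) { printf("rollback %u\n", fn); }

    static int assign_device(void)
    {
        unsigned int fn;

        for ( fn = 0; fn < NR_FN; fn++ )
            if ( !assign_one(fn) )
                break;

        if ( fn == NR_FN )
            return 0;               /* every function assigned */

        /* Partial assignment is worse than failure: the device could
         * still issue DMA with IDs that stayed owned elsewhere. */
        while ( fn-- )
            deassign_one(fn);

        return -1;
    }

    int main(void) { return assign_device() ? 1 : 0; }
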
- -This is XSA-449 / CVE-2023-46839 - -Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich ---- - xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------ - 1 file changed, 21 insertions(+), 6 deletions(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 1439d1ef2b26..47c0eee7bdcc 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -1488,11 +1488,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - pdev->fault.count = 0; - -- if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn, -- pci_to_dev(pdev), flag)) ) -- goto done; -+ rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), -+ flag); - -- for ( ; pdev->phantom_stride; rc = 0 ) -+ while ( pdev->phantom_stride && !rc ) - { - devfn += pdev->phantom_stride; - if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) -@@ -1503,8 +1502,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - done: - if ( rc ) -- printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n", -- d, &PCI_SBDF(seg, bus, devfn), rc); -+ { -+ printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", -+ d, devfn != pdev->devfn ? "phantom function " : "", -+ &PCI_SBDF(seg, bus, devfn), rc); -+ -+ if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) -+ { -+ /* -+ * Device with phantom functions that failed to both assign and -+ * rollback. Mark the device as broken and crash the target domain, -+ * as the state of the functions at this point is unknown and Xen -+ * has no way to assert consistent context assignment among them. -+ */ -+ pdev->broken = true; -+ if ( !is_hardware_domain(d) && d != dom_io ) -+ domain_crash(d); -+ } -+ } - /* The device is assigned to dom_io so mark it as quarantined */ - else if ( d == dom_io ) - pdev->quarantine = true; --- -2.43.0 - diff --git a/xsa450.patch b/xsa450.patch deleted file mode 100644 index e94933b..0000000 --- a/xsa450.patch +++ /dev/null @@ -1,59 +0,0 @@ -From: Andrew Cooper -Subject: VT-d: Fix "else" vs "#endif" misplacement - -In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This -generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body -is executed unconditionally. - -Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's -clearer to follow. This in turn involves adjusting p2m_get_pagetable() to -compile when CONFIG_HVM is disabled. - -This is XSA-450 / CVE-2023-46840. - -Reported-by: Reported-by: Teddy Astie -Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only") -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h -index 32f3f394b05a..6ada585eaac2 100644 ---- a/xen/arch/x86/include/asm/p2m.h -+++ b/xen/arch/x86/include/asm/p2m.h -@@ -435,7 +435,14 @@ static inline bool p2m_is_altp2m(const struct p2m_domain *p2m) - return p2m->p2m_class == p2m_alternate; - } - --#define p2m_get_pagetable(p2m) ((p2m)->phys_table) -+#ifdef CONFIG_HVM -+static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m) -+{ -+ return p2m->phys_table; -+} -+#else -+pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m); -+#endif - - /* - * Ensure any deferred p2m TLB flush has been completed on all VCPUs. 
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 99b642f12ef9..4244855032ee 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, - - if ( pgd_maddr ) - /* nothing */; --#ifdef CONFIG_HVM -- else if ( iommu_use_hap_pt(d) ) -+ else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) ) - { - pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); - - pgd_maddr = pagetable_get_paddr(pgt); - } - else --#endif - { - if ( !hd->arch.vtd.pgd_maddr ) - { diff --git a/xsa451-4.17.patch b/xsa451-4.17.patch deleted file mode 100644 index 0206bef..0000000 --- a/xsa451-4.17.patch +++ /dev/null @@ -1,193 +0,0 @@ -From: Jan Beulich -Subject: x86: account for shadow stack in exception-from-stub recovery - -Dealing with exceptions raised from within emulation stubs involves -discarding return address (replaced by exception related information). -Such discarding of course also requires removing the corresponding entry -from the shadow stack. - -Also amend the comment in fixup_exception_return(), to further clarify -why use of ptr[1] can't be an out-of-bounds access. - -This is CVE-2023-46841 / XSA-451. - -Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") -Signed-off-by: Jan Beulich -Reviewed-by: Andrew Cooper - ---- a/xen/arch/x86/extable.c -+++ b/xen/arch/x86/extable.c -@@ -86,26 +86,29 @@ search_one_extable(const struct exceptio - } - - unsigned long --search_exception_table(const struct cpu_user_regs *regs) -+search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) - { - const struct virtual_region *region = find_text_region(regs->rip); - unsigned long stub = this_cpu(stubs.addr); - - if ( region && region->ex ) -+ { -+ *stub_ra = 0; - return search_one_extable(region->ex, region->ex_end, regs->rip); -+ } - - if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && - regs->rip < stub + STUB_BUF_SIZE && - regs->rsp > (unsigned long)regs && - regs->rsp < (unsigned long)get_cpu_info() ) - { -- unsigned long retptr = *(unsigned long *)regs->rsp; -+ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; - -- region = find_text_region(retptr); -- retptr = region && region->ex -- ? search_one_extable(region->ex, region->ex_end, retptr) -- : 0; -- if ( retptr ) -+ region = find_text_region(retaddr); -+ fixup = region && region->ex -+ ? 
search_one_extable(region->ex, region->ex_end, retaddr) -+ : 0; -+ if ( fixup ) - { - /* - * Put trap number and error code on the stack (in place of the -@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_ - }; - - *(unsigned long *)regs->rsp = token.raw; -- return retptr; -+ *stub_ra = retaddr; -+ return fixup; - } - } - ---- a/xen/arch/x86/include/asm/uaccess.h -+++ b/xen/arch/x86/include/asm/uaccess.h -@@ -421,7 +421,8 @@ union stub_exception_token { - unsigned long raw; - }; - --extern unsigned long search_exception_table(const struct cpu_user_regs *regs); -+extern unsigned long search_exception_table(const struct cpu_user_regs *regs, -+ unsigned long *stub_ra); - extern void sort_exception_tables(void); - extern void sort_exception_table(struct exception_table_entry *start, - const struct exception_table_entry *stop); ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -856,7 +856,7 @@ void do_unhandled_trap(struct cpu_user_r - } - - static void fixup_exception_return(struct cpu_user_regs *regs, -- unsigned long fixup) -+ unsigned long fixup, unsigned long stub_ra) - { - if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) - { -@@ -873,7 +873,8 @@ static void fixup_exception_return(struc - /* - * Search for %rip. The shstk currently looks like this: - * -- * ... [Likely pointed to by SSP] -+ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] -+ * ... [Pointed to by SSP for most exceptions, empty in IST cases] - * %cs [== regs->cs] - * %rip [== regs->rip] - * SSP [Likely points to 3 slots higher, above %cs] -@@ -891,7 +892,56 @@ static void fixup_exception_return(struc - */ - if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) - { -+ unsigned long primary_shstk = -+ (ssp & ~(STACK_SIZE - 1)) + -+ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; -+ - wrss(fixup, ptr); -+ -+ if ( !stub_ra ) -+ goto shstk_done; -+ -+ /* -+ * Stub recovery ought to happen only when the outer context -+ * was on the main shadow stack. We need to also "pop" the -+ * stub's return address from the interrupted context's shadow -+ * stack. That is, -+ * - if we're still on the main stack, we need to move the -+ * entire stack (up to and including the exception frame) -+ * up by one slot, incrementing the original SSP in the -+ * exception frame, -+ * - if we're on an IST stack, we need to increment the -+ * original SSP. -+ */ -+ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); -+ -+ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) -+ { -+ /* -+ * We're on an IST stack. First make sure the two return -+ * addresses actually match. Then increment the interrupted -+ * context's SSP. -+ */ -+ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); -+ wrss(ptr[-1] + 8, &ptr[-1]); -+ goto shstk_done; -+ } -+ -+ /* Make sure the two return addresses actually match. */ -+ BUG_ON(stub_ra != ptr[2]); -+ -+ /* Move exception frame, updating SSP there. */ -+ wrss(ptr[1], &ptr[2]); /* %cs */ -+ wrss(ptr[0], &ptr[1]); /* %rip */ -+ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ -+ -+ /* Move all newer entries. */ -+ while ( --ptr != _p(ssp) ) -+ wrss(ptr[-1], &ptr[0]); -+ -+ /* Finally account for our own stack having shifted up. 
*/ -+ asm volatile ( "incsspd %0" :: "r" (2) ); -+ - goto shstk_done; - } - } -@@ -912,7 +962,8 @@ static void fixup_exception_return(struc - - static bool extable_fixup(struct cpu_user_regs *regs, bool print) - { -- unsigned long fixup = search_exception_table(regs); -+ unsigned long stub_ra = 0; -+ unsigned long fixup = search_exception_table(regs, &stub_ra); - - if ( unlikely(fixup == 0) ) - return false; -@@ -926,7 +977,7 @@ static bool extable_fixup(struct cpu_use - vector_name(regs->entry_vector), regs->error_code, - _p(regs->rip), _p(regs->rip), _p(fixup)); - -- fixup_exception_return(regs, fixup); -+ fixup_exception_return(regs, fixup, stub_ra); - this_cpu(last_extable_addr) = regs->rip; - - return true; -@@ -1214,7 +1265,7 @@ void do_invalid_op(struct cpu_user_regs - void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); - - fn(regs); -- fixup_exception_return(regs, (unsigned long)eip); -+ fixup_exception_return(regs, (unsigned long)eip, 0); - return; - } - -@@ -1235,7 +1286,7 @@ void do_invalid_op(struct cpu_user_regs - case BUGFRAME_warn: - printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); - show_execution_state(regs); -- fixup_exception_return(regs, (unsigned long)eip); -+ fixup_exception_return(regs, (unsigned long)eip, 0); - return; - - case BUGFRAME_bug: diff --git a/xsa452-4.17-1.patch b/xsa452-4.17-1.patch deleted file mode 100644 index cdec10e..0000000 --- a/xsa452-4.17-1.patch +++ /dev/null @@ -1,304 +0,0 @@ -From: Andrew Cooper -Subject: x86/entry: Introduce EFRAME_* constants - -restore_all_guest() does a lot of manipulation of the stack after popping the -GPRs, and uses raw %rsp displacements to do so. Also, almost all entrypaths -use raw %rsp displacements prior to pushing GPRs. - -Provide better mnemonics, to aid readability and reduce the chance of errors -when editing. - -No functional change. The resulting binary is identical. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035) - -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index 287dac101ad4..31fa63b77fd1 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -51,6 +51,23 @@ void __dummy__(void) - OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); - BLANK(); - -+ /* -+ * EFRAME_* is for the entry/exit logic where %rsp is pointing at -+ * UREGS_error_code and GPRs are still/already guest values. -+ */ -+#define OFFSET_EF(sym, mem) \ -+ DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ -+ offsetof(struct cpu_user_regs, error_code)) -+ -+ OFFSET_EF(EFRAME_entry_vector, entry_vector); -+ OFFSET_EF(EFRAME_rip, rip); -+ OFFSET_EF(EFRAME_cs, cs); -+ OFFSET_EF(EFRAME_eflags, eflags); -+ OFFSET_EF(EFRAME_rsp, rsp); -+ BLANK(); -+ -+#undef OFFSET_EF -+ - OFFSET(VCPU_processor, struct vcpu, processor); - OFFSET(VCPU_domain, struct vcpu, domain); - OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index 253bb1688c4f..7c211314d885 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -15,7 +15,7 @@ ENTRY(entry_int82) - ENDBR64 - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - pushq $0 -- movl $HYPERCALL_VECTOR, 4(%rsp) -+ movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) - SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. 
*/ - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 585b0c955191..412cbeb3eca4 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -190,15 +190,15 @@ restore_all_guest: - SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ - - RESTORE_ALL -- testw $TRAP_syscall,4(%rsp) -+ testw $TRAP_syscall, EFRAME_entry_vector(%rsp) - jz iret_exit_to_guest - -- movq 24(%rsp),%r11 # RFLAGS -+ mov EFRAME_eflags(%rsp), %r11 - andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 - orq $X86_EFLAGS_IF,%r11 - - /* Don't use SYSRET path if the return address is not canonical. */ -- movq 8(%rsp),%rcx -+ mov EFRAME_rip(%rsp), %rcx - sarq $47,%rcx - incl %ecx - cmpl $1,%ecx -@@ -213,20 +213,20 @@ restore_all_guest: - ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK - #endif - -- movq 8(%rsp), %rcx # RIP -- cmpw $FLAT_USER_CS32,16(%rsp)# CS -- movq 32(%rsp),%rsp # RSP -+ mov EFRAME_rip(%rsp), %rcx -+ cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) -+ mov EFRAME_rsp(%rsp), %rsp - je 1f - sysretq - 1: sysretl - - ALIGN - .Lrestore_rcx_iret_exit_to_guest: -- movq 8(%rsp), %rcx # RIP -+ mov EFRAME_rip(%rsp), %rcx - /* No special register assumptions. */ - iret_exit_to_guest: -- andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) -- orl $X86_EFLAGS_IF,24(%rsp) -+ andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) -+ orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) - addq $8,%rsp - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) -@@ -257,7 +257,7 @@ ENTRY(lstar_enter) - pushq $FLAT_KERNEL_CS64 - pushq %rcx - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -294,7 +294,7 @@ ENTRY(cstar_enter) - pushq $FLAT_USER_CS32 - pushq %rcx - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved) - pushq $3 /* ring 3 null cs */ - pushq $0 /* null rip */ - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap) - ENDBR64 - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - pushq $0 -- movl $0x80, 4(%rsp) -+ movl $0x80, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -649,7 +649,7 @@ ret_from_intr: - .section .init.text, "ax", @progbits - ENTRY(early_page_fault) - ENDBR64 -- movl $TRAP_page_fault, 4(%rsp) -+ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) - SAVE_ALL - movq %rsp, %rdi - call do_early_page_fault -@@ -716,7 +716,7 @@ ENTRY(common_interrupt) - - ENTRY(page_fault) - ENDBR64 -- movl $TRAP_page_fault,4(%rsp) -+ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) - /* No special register assumptions. 
*/ - GLOBAL(handle_exception) - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP -@@ -892,90 +892,90 @@ FATAL_exception_with_ints_disabled: - ENTRY(divide_error) - ENDBR64 - pushq $0 -- movl $TRAP_divide_error,4(%rsp) -+ movl $TRAP_divide_error, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(coprocessor_error) - ENDBR64 - pushq $0 -- movl $TRAP_copro_error,4(%rsp) -+ movl $TRAP_copro_error, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(simd_coprocessor_error) - ENDBR64 - pushq $0 -- movl $TRAP_simd_error,4(%rsp) -+ movl $TRAP_simd_error, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(device_not_available) - ENDBR64 - pushq $0 -- movl $TRAP_no_device,4(%rsp) -+ movl $TRAP_no_device, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(debug) - ENDBR64 - pushq $0 -- movl $TRAP_debug,4(%rsp) -+ movl $TRAP_debug, EFRAME_entry_vector(%rsp) - jmp handle_ist_exception - - ENTRY(int3) - ENDBR64 - pushq $0 -- movl $TRAP_int3,4(%rsp) -+ movl $TRAP_int3, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(overflow) - ENDBR64 - pushq $0 -- movl $TRAP_overflow,4(%rsp) -+ movl $TRAP_overflow, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(bounds) - ENDBR64 - pushq $0 -- movl $TRAP_bounds,4(%rsp) -+ movl $TRAP_bounds, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(invalid_op) - ENDBR64 - pushq $0 -- movl $TRAP_invalid_op,4(%rsp) -+ movl $TRAP_invalid_op, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(invalid_TSS) - ENDBR64 -- movl $TRAP_invalid_tss,4(%rsp) -+ movl $TRAP_invalid_tss, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(segment_not_present) - ENDBR64 -- movl $TRAP_no_segment,4(%rsp) -+ movl $TRAP_no_segment, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(stack_segment) - ENDBR64 -- movl $TRAP_stack_error,4(%rsp) -+ movl $TRAP_stack_error, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(general_protection) - ENDBR64 -- movl $TRAP_gp_fault,4(%rsp) -+ movl $TRAP_gp_fault, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(alignment_check) - ENDBR64 -- movl $TRAP_alignment_check,4(%rsp) -+ movl $TRAP_alignment_check, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_CP) - ENDBR64 -- movl $X86_EXC_CP, 4(%rsp) -+ movl $X86_EXC_CP, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(double_fault) - ENDBR64 -- movl $TRAP_double_fault,4(%rsp) -+ movl $TRAP_double_fault, EFRAME_entry_vector(%rsp) - /* Set AC to reduce chance of further SMAP faults */ - ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP - SAVE_ALL -@@ -1001,7 +1001,7 @@ ENTRY(double_fault) - ENTRY(nmi) - ENDBR64 - pushq $0 -- movl $TRAP_nmi,4(%rsp) -+ movl $TRAP_nmi, EFRAME_entry_vector(%rsp) - handle_ist_exception: - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - SAVE_ALL -@@ -1134,7 +1134,7 @@ handle_ist_exception: - ENTRY(machine_check) - ENDBR64 - pushq $0 -- movl $TRAP_machine_check,4(%rsp) -+ movl $TRAP_machine_check, EFRAME_entry_vector(%rsp) - jmp handle_ist_exception - - /* No op trap handler. Required for kexec crash path. */ -@@ -1171,7 +1171,7 @@ autogen_stubs: /* Automatically generated stubs. */ - 1: - ENDBR64 - pushq $0 -- movb $vec,4(%rsp) -+ movb $vec, EFRAME_entry_vector(%rsp) - jmp common_interrupt - - entrypoint 1b -@@ -1185,7 +1185,7 @@ autogen_stubs: /* Automatically generated stubs. */ - test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ - jz 2f /* size is 8 bytes. 
Check whether the processor gave us an */ - pushq $0 /* error code, and insert an empty one if not. */ --2: movb $vec,4(%rsp) -+2: movb $vec, EFRAME_entry_vector(%rsp) - jmp handle_exception - - entrypoint 1b diff --git a/xsa452-4.17-2.patch b/xsa452-4.17-2.patch deleted file mode 100644 index 4535397..0000000 --- a/xsa452-4.17-2.patch +++ /dev/null @@ -1,90 +0,0 @@ -From: Andrew Cooper -Subject: x86: Resync intel-family.h from Linux - -From v6.8-rc6 - -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich -(cherry picked from commit 195e75371b13c4f7ecdf7b5c50aed0d02f2d7ce8) - -diff --git a/xen/arch/x86/include/asm/intel-family.h b/xen/arch/x86/include/asm/intel-family.h -index ffc49151befe..b65e9c46b922 100644 ---- a/xen/arch/x86/include/asm/intel-family.h -+++ b/xen/arch/x86/include/asm/intel-family.h -@@ -26,6 +26,9 @@ - * _G - parts with extra graphics on - * _X - regular server parts - * _D - micro server parts -+ * _N,_P - other mobile parts -+ * _H - premium mobile parts -+ * _S - other client parts - * - * Historical OPTDIFFs: - * -@@ -37,6 +40,9 @@ - * their own names :-( - */ - -+/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -+#define INTEL_FAM6_ANY X86_MODEL_ANY -+ - #define INTEL_FAM6_CORE_YONAH 0x0E - - #define INTEL_FAM6_CORE2_MEROM 0x0F -@@ -93,8 +99,6 @@ - #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ - #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ - --#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ -- - #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ - - #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ -@@ -102,12 +106,31 @@ - - #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ - -+#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF -+ -+#define INTEL_FAM6_GRANITERAPIDS_X 0xAD -+#define INTEL_FAM6_GRANITERAPIDS_D 0xAE -+ -+/* "Hybrid" Processors (P-Core/E-Core) */ -+ -+#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ -+ - #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ - #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ - --#define INTEL_FAM6_RAPTORLAKE 0xB7 -+#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ -+#define INTEL_FAM6_RAPTORLAKE_P 0xBA -+#define INTEL_FAM6_RAPTORLAKE_S 0xBF -+ -+#define INTEL_FAM6_METEORLAKE 0xAC -+#define INTEL_FAM6_METEORLAKE_L 0xAA -+ -+#define INTEL_FAM6_ARROWLAKE_H 0xC5 -+#define INTEL_FAM6_ARROWLAKE 0xC6 -+ -+#define INTEL_FAM6_LUNARLAKE_M 0xBD - --/* "Small Core" Processors (Atom) */ -+/* "Small Core" Processors (Atom/E-Core) */ - - #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ - #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ -@@ -134,6 +157,13 @@ - #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ - #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ - -+#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ -+ -+#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ -+#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ -+ -+#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ -+ - /* Xeon Phi */ - - #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ diff --git a/xsa452-4.17-3.patch b/xsa452-4.17-3.patch deleted file mode 100644 index 0a39333..0000000 --- a/xsa452-4.17-3.patch +++ /dev/null @@ -1,135 +0,0 @@ -From: Andrew Cooper -Subject: x86/vmx: Perform VERW flushing later in the VMExit path - -Broken out of the following patch because this change is subtle enough on its -own. See it for the rational of why we're moving VERW. 
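The paragraphs that follow describe the eflags encoding in prose. As a rough C model of what the final OR instruction computes (assuming only that SCF_verw is a single bit within the low byte, which the BUILD_BUG_ON added by this patch enforces; the concrete value below is a stand-in, not Xen's real one):

    #include <stdbool.h>
    #include <stdint.h>

    #define SCF_VERW 0x01u  /* stand-in: any single low-byte bit behaves the same */

    static uint32_t encode(bool vmx_launched, uint8_t spec_ctrl_flags)
    {
        /* asm: shl $31 parks "launched" in the sign bit, while the masked
         * flags byte feeds the parity calculation. */
        return ((uint32_t)vmx_launched << 31) | (spec_ctrl_flags & SCF_VERW);
    }

    /* SF set means vmx_launched, hence VMRESUME (asm: jns .Lvmx_launch). */
    static bool want_resume(uint32_t v) { return v >> 31; }

    /* PF reflects the parity of the low result byte: zero bits set is even
     * parity, so PF=1 exactly when the verw bit was clear (asm: jpe). */
    static bool skip_verw(uint32_t v) { return !__builtin_parity(v & 0xff); }

Both predicates survive the popping of the GPRs because they live in eflags, which is the entire point of the construction.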
- -As for how, extend the trick already used to hold one condition in -flags (RESUME vs LAUNCH) through the POPing of GPRs. - -Move the MOV CR earlier. Intel specify flags to be undefined across it. - -Encode the two conditions we want using SF and PF. See the code comment for -exactly how. - -Leave a comment to explain the lack of any content around -SPEC_CTRL_EXIT_TO_VMX, but leave the block in place. Sods law says if we -delete it, we'll need to reintroduce it. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f) - -diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S -index 5f5de45a1309..cdde76e13892 100644 ---- a/xen/arch/x86/hvm/vmx/entry.S -+++ b/xen/arch/x86/hvm/vmx/entry.S -@@ -87,17 +87,39 @@ UNLIKELY_END(realmode) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ -- DO_SPEC_CTRL_COND_VERW -+ /* -+ * All speculation safety work happens to be elsewhere. VERW is after -+ * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left -+ * to the MSR load list. -+ */ - - mov VCPU_hvm_guest_cr2(%rbx),%rax -+ mov %rax, %cr2 -+ -+ /* -+ * We need to perform two conditional actions (VERW, and Resume vs -+ * Launch) after popping GPRs. With some cunning, we can encode both -+ * of these in eflags together. -+ * -+ * Parity is only calculated over the bottom byte of the answer, while -+ * Sign is simply the top bit. -+ * -+ * Therefore, the final OR instruction ends up producing: -+ * SF = VCPU_vmx_launched -+ * PF = !SCF_verw -+ */ -+ BUILD_BUG_ON(SCF_verw & ~0xff) -+ movzbl VCPU_vmx_launched(%rbx), %ecx -+ shl $31, %ecx -+ movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax -+ and $SCF_verw, %eax -+ or %eax, %ecx - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp -- mov %rax,%cr2 -- cmpb $0,VCPU_vmx_launched(%rbx) - pop %rbx - pop %r11 - pop %r10 -@@ -108,7 +130,13 @@ UNLIKELY_END(realmode) - pop %rdx - pop %rsi - pop %rdi -- je .Lvmx_launch -+ -+ jpe .L_skip_verw -+ /* VERW clobbers ZF, but preserves all others, including SF. */ -+ verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) -+.L_skip_verw: -+ -+ jns .Lvmx_launch - - /*.Lvmx_resume:*/ - VMRESUME -diff --git a/xen/arch/x86/include/asm/asm_defns.h b/xen/arch/x86/include/asm/asm_defns.h -index d9431180cfba..abc6822b08c8 100644 ---- a/xen/arch/x86/include/asm/asm_defns.h -+++ b/xen/arch/x86/include/asm/asm_defns.h -@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp"); - - #ifdef __ASSEMBLY__ - -+.macro BUILD_BUG_ON condstr, cond:vararg -+ .if \cond -+ .error "Condition \"\condstr\" not satisfied" -+ .endif -+.endm -+/* preprocessor macro to make error message more user friendly */ -+#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond -+ - #ifdef HAVE_AS_QUOTED_SYM - #define SUBSECTION_LBL(tag) \ - .ifndef .L.tag; \ -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index f4b8b9d9561c..ca9cb0f5dd1d 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -164,6 +164,13 @@ - #endif - .endm - -+/* -+ * Helper to improve the readibility of stack dispacements with %rsp in -+ * unusual positions. Both @field and @top_of_stack should be constants from -+ * the same object. @top_of_stack should be where %rsp is currently pointing. 
-+ */ -+#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) -+ - .macro DO_SPEC_CTRL_COND_VERW - /* - * Requires %rsp=cpuinfo -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index 31fa63b77fd1..a4e94d693024 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -135,6 +135,7 @@ void __dummy__(void) - #endif - - OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); -+ OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); - OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); - OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); - OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); diff --git a/xsa452-4.17-4.patch b/xsa452-4.17-4.patch deleted file mode 100644 index fa9bb12..0000000 --- a/xsa452-4.17-4.patch +++ /dev/null @@ -1,197 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Perform VERW flushing later in exit paths - -On parts vulnerable to RFDS, VERW's side effects are extended to scrub all -non-architectural entries in various Physical Register Files. To remove all -of Xen's values, the VERW must be after popping the GPRs. - -Rework SPEC_CTRL_COND_VERW to default to an CPUINFO_error_code %rsp position, -but with overrides for other contexts. Identify that it clobbers eflags; this -is particularly relevant for the SYSRET path. - -For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a -shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to -issue the VERW. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index ca9cb0f5dd1d..97a97b2b82c9 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -171,16 +171,23 @@ - */ - #define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) - --.macro DO_SPEC_CTRL_COND_VERW -+.macro SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \ -+ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) - /* -- * Requires %rsp=cpuinfo -+ * Requires \scf and \sel as %rsp-relative expressions -+ * Clobbers eflags -+ * -+ * VERW needs to run after guest GPRs have been restored, where only %rsp is -+ * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. -+ * Contexts where this is not true must provide an alternative \scf and \sel. - * - * Issue a VERW for its flushing side effect, if indicated. This is a Spectre - * v1 gadget, but the IRET/VMEntry is serialising. 
- */ -- testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) -+ testb $SCF_verw, \scf(%rsp) - jz .L\@_verw_skip -- verw CPUINFO_verw_sel(%rsp) -+ verw \sel(%rsp) - .L\@_verw_skip: - .endm - -@@ -298,8 +305,6 @@ - */ - ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV - -- DO_SPEC_CTRL_COND_VERW -- - ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - .endm - -@@ -379,7 +384,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %r12=ist_exit, %r14=stack_end -+ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs - * Clobbers %rax, %rbx, %rcx, %rdx - */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx -@@ -407,11 +412,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - test %r12, %r12 - jz .L\@_skip_ist_exit - -- /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ -- testb $SCF_verw, %bl -- jz .L\@_skip_verw -- verw STACK_CPUINFO_FIELD(verw_sel)(%r14) --.L\@_skip_verw: -+ /* -+ * Stash SCF and verw_sel above eflags in the case of an IST_exit. The -+ * VERW logic needs to run after guest GPRs have been restored; i.e. where -+ * we cannot use %r12 or %r14 for the purposes they have here. -+ * -+ * When the CPU pushed this exception frame, it zero-extended eflags. -+ * Therefore it is safe for the VERW logic to look at the stashed SCF -+ * outside of the ist_exit condition. Also, this stashing won't influence -+ * any other restore_all_guest() paths. -+ */ -+ or $(__HYPERVISOR_DS32 << 16), %ebx -+ mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ - - ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index a4e94d693024..4cd5938d7b9d 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -55,14 +55,22 @@ void __dummy__(void) - * EFRAME_* is for the entry/exit logic where %rsp is pointing at - * UREGS_error_code and GPRs are still/already guest values. - */ --#define OFFSET_EF(sym, mem) \ -+#define OFFSET_EF(sym, mem, ...) \ - DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ -- offsetof(struct cpu_user_regs, error_code)) -+ offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) - - OFFSET_EF(EFRAME_entry_vector, entry_vector); - OFFSET_EF(EFRAME_rip, rip); - OFFSET_EF(EFRAME_cs, cs); - OFFSET_EF(EFRAME_eflags, eflags); -+ -+ /* -+ * These aren't real fields. They're spare space, used by the IST -+ * exit-to-xen path. -+ */ -+ OFFSET_EF(EFRAME_shadow_scf, eflags, +4); -+ OFFSET_EF(EFRAME_shadow_sel, eflags, +6); -+ - OFFSET_EF(EFRAME_rsp, rsp); - BLANK(); - -@@ -136,6 +144,7 @@ void __dummy__(void) - - OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); - OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); -+ OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); - OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); - OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); - OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index 7c211314d885..3b2fbcd8733a 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest) - SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ - - RESTORE_ALL adj=8 compat=1 -+ -+ /* Account for ev/ec having already been popped off the stack. 
*/ -+ SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \ -+ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) -+ - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 412cbeb3eca4..ef517e2945b0 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -214,6 +214,9 @@ restore_all_guest: - #endif - - mov EFRAME_rip(%rsp), %rcx -+ -+ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ -+ - cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) - mov EFRAME_rsp(%rsp), %rsp - je 1f -@@ -227,6 +230,9 @@ restore_all_guest: - iret_exit_to_guest: - andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) - orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) -+ -+ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ -+ - addq $8,%rsp - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) -@@ -679,9 +685,22 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ - - RESTORE_ALL adj=8 -+ -+ /* -+ * When the CPU pushed this exception frame, it zero-extended eflags. -+ * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of -+ * spec_ctrl_flags and ver_sel above eflags, as we can't use any GPRs, -+ * and we're at a random place on the stack, not in a CPUFINFO block. -+ * -+ * Account for ev/ec having already been popped off the stack. -+ */ -+ SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ -+ sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) -+ - iretq - - ENTRY(common_interrupt) diff --git a/xsa452-4.17-5.patch b/xsa452-4.17-5.patch deleted file mode 100644 index 0230b33..0000000 --- a/xsa452-4.17-5.patch +++ /dev/null @@ -1,239 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Rename VERW related options - -VERW is going to be used for a 3rd purpose, and the existing nomenclature -didn't survive the Stale MMIO issues terribly well. - -Rename the command line option from `md-clear=` to `verw=`. This is more -consistent with other options which tend to be named based on what they're -doing, not which feature enumeration they use behind the scenes. Retain -`md-clear=` as a deprecated alias. - -Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio}, -which has a side effect of making spec_ctrl_init_domain() rather clearer to -follow. - -No functional change. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 2006697226de..d909ec94fe7c 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2324,7 +2324,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) - > `= List of [ , xen=, {pv,hvm}=, --> {msr-sc,rsb,md-clear,ibpb-entry}=|{pv,hvm}=, -+> {msr-sc,rsb,verw,ibpb-entry}=|{pv,hvm}=, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio,gds-mit,div-scrub}= ]` -@@ -2349,7 +2349,7 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. 
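All of the fine-grained spec-ctrl sub-options named here follow the same tri-state convention internally. A minimal sketch of that pattern (condensed from the opt_verw_* handling later in this patch; the hw_vulnerable parameter is a stand-in for the real MDS susceptibility checks):

    #include <stdbool.h>
    #include <stdint.h>

    /* -1 = no explicit choice on the command line; 0/1 = forced by the user. */
    static int8_t opt_verw_pv = -1;

    static void resolve_default(bool hw_vulnerable)
    {
        if ( opt_verw_pv == -1 )          /* nothing parsed from spec-ctrl= */
            opt_verw_pv = hw_vulnerable;  /* default tracks susceptibility */
    }

This is also why the rename can keep `md-clear=` as a deprecated alias: both spellings feed the same tri-state variables.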
- --The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options - offer fine grained control over the primitives by Xen. These impact Xen's - ability to protect itself, and/or Xen's ability to virtualise support for - guests to use. -@@ -2366,11 +2366,12 @@ guests to use. - guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. - * `rsb=` offers control over whether to overwrite the Return Stack Buffer / - Return Address Stack on entry to Xen and on idle. --* `md-clear=` offers control over whether to use VERW to flush -- microarchitectural buffers on idle and exit from Xen. *Note: For -- compatibility with development versions of this fix, `mds=` is also accepted -- on Xen 4.12 and earlier as an alias. Consult vendor documentation in -- preference to here.* -+* `verw=` offers control over whether to use VERW for its scrubbing side -+ effects at appropriate privilege transitions. The exact side effects are -+ microarchitecture and microcode specific. *Note: `md-clear=` is accepted as -+ a deprecated alias. For compatibility with development versions of XSA-297, -+ `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor -+ documentation in preference to here.* - * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction - Barrier) is used on entry to Xen. This is used by default on hardware - vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 25a18ac598fa..e12ec9930cf7 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -37,8 +37,8 @@ static bool __initdata opt_msr_sc_pv = true; - static bool __initdata opt_msr_sc_hvm = true; - static int8_t __initdata opt_rsb_pv = -1; - static bool __initdata opt_rsb_hvm = true; --static int8_t __ro_after_init opt_md_clear_pv = -1; --static int8_t __ro_after_init opt_md_clear_hvm = -1; -+static int8_t __ro_after_init opt_verw_pv = -1; -+static int8_t __ro_after_init opt_verw_hvm = -1; - - static int8_t __ro_after_init opt_ibpb_entry_pv = -1; - static int8_t __ro_after_init opt_ibpb_entry_hvm = -1; -@@ -78,7 +78,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. 
- - static int8_t __initdata opt_srb_lock = -1; - static bool __initdata opt_unpriv_mmio; --static bool __ro_after_init opt_fb_clear_mmio; -+static bool __ro_after_init opt_verw_mmio; - static int8_t __initdata opt_gds_mit = -1; - static int8_t __initdata opt_div_scrub = -1; - -@@ -120,8 +120,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - disable_common: - opt_rsb_pv = false; - opt_rsb_hvm = false; -- opt_md_clear_pv = 0; -- opt_md_clear_hvm = 0; -+ opt_verw_pv = 0; -+ opt_verw_hvm = 0; - opt_ibpb_entry_pv = 0; - opt_ibpb_entry_hvm = 0; - opt_ibpb_entry_dom0 = false; -@@ -152,14 +152,14 @@ static int __init cf_check parse_spec_ctrl(const char *s) - { - opt_msr_sc_pv = val; - opt_rsb_pv = val; -- opt_md_clear_pv = val; -+ opt_verw_pv = val; - opt_ibpb_entry_pv = val; - } - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) - { - opt_msr_sc_hvm = val; - opt_rsb_hvm = val; -- opt_md_clear_hvm = val; -+ opt_verw_hvm = val; - opt_ibpb_entry_hvm = val; - } - else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) -@@ -204,21 +204,22 @@ static int __init cf_check parse_spec_ctrl(const char *s) - break; - } - } -- else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) -+ else if ( (val = parse_boolean("verw", s, ss)) != -1 || -+ (val = parse_boolean("md-clear", s, ss)) != -1 ) - { - switch ( val ) - { - case 0: - case 1: -- opt_md_clear_pv = opt_md_clear_hvm = val; -+ opt_verw_pv = opt_verw_hvm = val; - break; - - case -2: -- s += strlen("md-clear="); -+ s += (*s == 'v') ? strlen("verw=") : strlen("md-clear="); - if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -- opt_md_clear_pv = val; -+ opt_verw_pv = val; - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -- opt_md_clear_hvm = val; -+ opt_verw_hvm = val; - else - default: - rc = -EINVAL; -@@ -540,8 +541,8 @@ static void __init print_details(enum ind_thunk thunk) - opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", - opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", -- opt_md_clear_pv || opt_md_clear_hvm || -- opt_fb_clear_mmio ? " VERW" : "", -+ opt_verw_pv || opt_verw_hvm || -+ opt_verw_mmio ? " VERW" : "", - opt_div_scrub ? " DIV" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); - -@@ -562,13 +563,13 @@ static void __init print_details(enum ind_thunk thunk) - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || - amd_virt_spec_ctrl || -- opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", -+ opt_eager_fpu || opt_verw_hvm) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", - (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || - amd_virt_spec_ctrl) ? " MSR_VIRT_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- opt_md_clear_hvm ? " MD_CLEAR" : "", -+ opt_verw_hvm ? " VERW" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); - - #endif -@@ -577,11 +578,11 @@ static void __init print_details(enum ind_thunk thunk) - (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || - boot_cpu_has(X86_FEATURE_SC_RSB_PV) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || -- opt_eager_fpu || opt_md_clear_pv) ? "" : " None", -+ opt_eager_fpu || opt_verw_pv) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- opt_md_clear_pv ? " MD_CLEAR" : "", -+ opt_verw_pv ? " VERW" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? 
" IBPB-entry" : ""); - - printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", -@@ -1514,8 +1515,8 @@ void spec_ctrl_init_domain(struct domain *d) - { - bool pv = is_pv_domain(d); - -- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || -- (opt_fb_clear_mmio && is_iommu_enabled(d))); -+ bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) || -+ (opt_verw_mmio && is_iommu_enabled(d))); - - bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && - (d->domain_id != 0 || opt_ibpb_entry_dom0)); -@@ -1878,19 +1879,20 @@ void __init init_speculation_mitigations(void) - * the return-to-guest path. - */ - if ( opt_unpriv_mmio ) -- opt_fb_clear_mmio = cpu_has_fb_clear; -+ opt_verw_mmio = cpu_has_fb_clear; - - /* - * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. - * This will only be a token effort for MLPDS/MFBDS when HT is enabled, - * but it is somewhat better than nothing. - */ -- if ( opt_md_clear_pv == -1 ) -- opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- boot_cpu_has(X86_FEATURE_MD_CLEAR)); -- if ( opt_md_clear_hvm == -1 ) -- opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- boot_cpu_has(X86_FEATURE_MD_CLEAR)); -+ if ( opt_verw_pv == -1 ) -+ opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -+ cpu_has_md_clear); -+ -+ if ( opt_verw_hvm == -1 ) -+ opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -+ cpu_has_md_clear); - - /* - * Enable MDS/MMIO defences as applicable. The Idle blocks need using if -@@ -1903,12 +1905,12 @@ void __init init_speculation_mitigations(void) - * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) - * - * After calculating the appropriate idle setting, simplify -- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM -+ * opt_verw_hvm to mean just "should we VERW on the way into HVM - * guests", so spec_ctrl_init_domain() can calculate suitable settings. - */ -- if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) -+ if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); -- opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; -+ opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; - - /* - * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT diff --git a/xsa452-4.17-6.patch b/xsa452-4.17-6.patch deleted file mode 100644 index bbe617e..0000000 --- a/xsa452-4.17-6.patch +++ /dev/null @@ -1,163 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: VERW-handling adjustments - -... before we add yet more complexity to this logic. Mostly expanded -comments, but with three minor changes. - -1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and - future ones. - -2) We only ever need SC_VERW_IDLE when SMT is active. If SMT isn't active, - then there's no re-partition of pipeline resources based on thread-idleness - to worry about. - -3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as - it turns out, wrong. SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush - is the relevant decision of whether to use L1D_FLUSH based on - susceptibility and user preference. - - Rewrite the logic so it can be followed, and incorporate the fact that when - FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution. - -This is part of XSA-452 / CVE-2023-28746. 
- -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich -(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index e12ec9930cf7..adb6bc74e8e6 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1531,7 +1531,7 @@ void __init init_speculation_mitigations(void) - { - enum ind_thunk thunk = THUNK_DEFAULT; - bool has_spec_ctrl, ibrs = false, hw_smt_enabled; -- bool cpu_has_bug_taa, retpoline_safe; -+ bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe; - - hw_smt_enabled = check_smt_enabled(); - -@@ -1867,50 +1867,97 @@ void __init init_speculation_mitigations(void) - "enabled. Please assess your configuration and choose an\n" - "explicit 'smt=' setting. See XSA-273.\n"); - -+ /* -+ * A brief summary of VERW-related changes. -+ * -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html -+ * -+ * Relevant ucodes: -+ * -+ * - May 2019, for MDS. Introduces the MD_CLEAR CPUID bit and VERW side -+ * effects to scrub Store/Load/Fill buffers as applicable. MD_CLEAR -+ * exists architecturally, even when the side effects have been removed. -+ * -+ * Use VERW to scrub on return-to-guest. Parts with L1D_FLUSH to -+ * mitigate L1TF have the same side effect, so no need to do both. -+ * -+ * Various Atoms suffer from Store-buffer sampling only. Store buffers -+ * are statically partitioned between non-idle threads, so scrubbing is -+ * wanted when going idle too. -+ * -+ * Load ports and Fill buffers are competitively shared between threads. -+ * SMT must be disabled for VERW scrubbing to be fully effective. -+ * -+ * - November 2019, for TAA. Extended VERW side effects to TSX-enabled -+ * MDS_NO parts. -+ * -+ * - February 2022, for Client TSX de-feature. Removed VERW side effects -+ * from Client CPUs only. -+ * -+ * - May 2022, for MMIO Stale Data. (Re)introduced Fill Buffer scrubbing -+ * on all MMIO-affected parts which didn't already have it for MDS -+ * reasons, enumerating FB_CLEAR on those parts only. -+ * -+ * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing -+ * side effects as VERW and cannot be used in its place. -+ */ - mds_calculations(); - - /* -- * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have -- * reintroduced the VERW fill buffer flushing side effect because of a -- * susceptibility to FBSDP. -+ * Parts which enumerate FB_CLEAR are those with now-updated microcode -+ * which weren't susceptible to the original MFBDS (and therefore didn't -+ * have Fill Buffer scrubbing side effects to begin with, or were Client -+ * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had -+ * the scrubbing reintroduced because of a susceptibility to FBSDP. - * - * If unprivileged guests have (or will have) MMIO mappings, we can - * mitigate cross-domain leakage of fill buffer data by issuing VERW on -- * the return-to-guest path. -+ * the return-to-guest path. This is only a token effort if SMT is -+ * active. - */ - if ( opt_unpriv_mmio ) - opt_verw_mmio = cpu_has_fb_clear; - - /* -- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. 
-- * This will only be a token effort for MLPDS/MFBDS when HT is enabled, -- * but it is somewhat better than nothing. -+ * MD_CLEAR is enumerated architecturally forevermore, even after the -+ * scrubbing side effects have been removed. Create ourselves an version -+ * which expressed whether we think MD_CLEAR is having any useful side -+ * effect. -+ */ -+ cpu_has_useful_md_clear = (cpu_has_md_clear && -+ (cpu_has_bug_mds || cpu_has_bug_msbds_only)); -+ -+ /* -+ * By default, use VERW scrubbing on applicable hardware, if we think it's -+ * going to have an effect. This will only be a token effort for -+ * MLPDS/MFBDS when SMT is enabled. - */ - if ( opt_verw_pv == -1 ) -- opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- cpu_has_md_clear); -+ opt_verw_pv = cpu_has_useful_md_clear; - - if ( opt_verw_hvm == -1 ) -- opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- cpu_has_md_clear); -+ opt_verw_hvm = cpu_has_useful_md_clear; - - /* -- * Enable MDS/MMIO defences as applicable. The Idle blocks need using if -- * either the PV or HVM MDS defences are used, or if we may give MMIO -- * access to untrusted guests. -- * -- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with -- * equivalent semantics to avoid needing to perform both flushes on the -- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for -- * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) -- * -- * After calculating the appropriate idle setting, simplify -- * opt_verw_hvm to mean just "should we VERW on the way into HVM -- * guests", so spec_ctrl_init_domain() can calculate suitable settings. -+ * If SMT is active, and we're protecting against MDS or MMIO stale data, -+ * we need to scrub before going idle as well as on return to guest. -+ * Various pipeline resources are repartitioned amongst non-idle threads. - */ -- if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) -+ if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || -+ opt_verw_mmio) && hw_smt_enabled ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); -- opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; -+ -+ /* -+ * After calculating the appropriate idle setting, simplify opt_verw_hvm -+ * to mean just "should we VERW on the way into HVM guests", so -+ * spec_ctrl_init_domain() can calculate suitable settings. -+ * -+ * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the -+ * only *_CLEAR we can see. -+ */ -+ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) -+ opt_verw_hvm = false; - - /* - * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT diff --git a/xsa452-4.17-7.patch b/xsa452-4.17-7.patch deleted file mode 100644 index 26ba4eb..0000000 --- a/xsa452-4.17-7.patch +++ /dev/null @@ -1,307 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Mitigation Register File Data Sampling - -RFDS affects Atom cores, also branded E-cores, between the Goldmont and -Gracemont microarchitectures. This includes Alder Lake and Raptor Lake hybrid -clien systems which have a mix of Gracemont and other types of cores. - -Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side -effets, and RFDS_NO to incidate that the system is unaffected. Plenty of -unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we -synthesise it. Alder Lake and Raptor Lake Xeon-E's are unaffected due to -their platform configuration, and we must use the Hybrid CPUID bit to -distinguish them from their non-Xeon counterparts. 
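The SKU disambiguation described here condenses to the following shape (a simplified rendering of the rfds_calculations() hunk below, not a substitute for it; the model constants are the intel-family.h ones this series resyncs):

    switch ( boot_cpu_data.x86_model )
    {
    case INTEL_FAM6_ALDERLAKE:
    case INTEL_FAM6_RAPTORLAKE:
        /* One model number covers client SKUs (hybrid, Gracemont E-cores
         * active, affected) and Xeon-E SKUs (E-cores fused off, not
         * affected).  CPUID's HYBRID bit is the tie-breaker. */
        if ( !cpu_has_hybrid )
            break;                 /* Xeon-E: fall out of the switch */
        fallthrough;
    case INTEL_FAM6_ALDERLAKE_L:   /* ... and the other affected models */
        return;                    /* affected: leave RFDS_NO unset */
    }

    setup_force_cpu_cap(X86_FEATURE_RFDS_NO);   /* believed unaffected */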
- -Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so -set it in the max policies and reflect the host setting in default. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31) - -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index aefc140d6651..5ceea8be073b 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -172,7 +172,7 @@ static const char *const str_7d0[32] = - [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl", - [10] = "md-clear", [11] = "rtm-always-abort", - /* 12 */ [13] = "tsx-force-abort", -- [14] = "serialize", -+ [14] = "serialize", [15] = "hybrid", - [16] = "tsxldtrk", - [18] = "pconfig", - [20] = "cet-ibt", -@@ -237,7 +237,8 @@ static const char *const str_m10Al[32] = - [20] = "bhi-no", [21] = "xapic-status", - /* 22 */ [23] = "ovrclk-status", - [24] = "pbrsb-no", [25] = "gds-ctrl", -- [26] = "gds-no", -+ [26] = "gds-no", [27] = "rfds-no", -+ [28] = "rfds-clear", - }; - - static const char *const str_m10Ah[32] = -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index 7b875a722142..96c2cee1a857 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -444,6 +444,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) - */ - __set_bit(X86_FEATURE_MD_CLEAR, fs); - __set_bit(X86_FEATURE_FB_CLEAR, fs); -+ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); - - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an -@@ -493,6 +494,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) - if ( cpu_has_fb_clear ) - __set_bit(X86_FEATURE_FB_CLEAR, fs); - -+ __clear_bit(X86_FEATURE_RFDS_CLEAR, fs); -+ if ( cpu_has_rfds_clear ) -+ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); -+ - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an - * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. -diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h -index ec824e895498..a6b8af12964c 100644 ---- a/xen/arch/x86/include/asm/cpufeature.h -+++ b/xen/arch/x86/include/asm/cpufeature.h -@@ -140,6 +140,7 @@ - #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) - #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) - #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) -+#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) - #define cpu_has_avx512_fp16 boot_cpu_has(X86_FEATURE_AVX512_FP16) - #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) - -@@ -161,6 +162,8 @@ - #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) - #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) - #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) -+#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) -+#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) - - /* Synthesized. 
*/ - #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h -index 6abf7bc34a4f..9b5f67711f0c 100644 ---- a/xen/arch/x86/include/asm/msr-index.h -+++ b/xen/arch/x86/include/asm/msr-index.h -@@ -88,6 +88,8 @@ - #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) - #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) - #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) -+#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) -+#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index adb6bc74e8e6..1ee81e2dfe79 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -24,6 +24,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -447,7 +448,7 @@ static void __init print_details(enum ind_thunk thunk) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -463,6 +464,7 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", - (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", - (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", -+ (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", -@@ -473,7 +475,7 @@ static void __init print_details(enum ind_thunk thunk) - (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); - - /* Hardware features which need driving to mitigate issues. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || -@@ -491,6 +493,7 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", - (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", - (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", -+ (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", - (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); - - /* Compiled-in support which pertains to mitigations. */ -@@ -1359,6 +1362,83 @@ static __init void mds_calculations(void) - } - } - -+/* -+ * Register File Data Sampling affects Atom cores from the Goldmont to -+ * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to -+ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still -+ * in support. -+ * -+ * Alder Lake and Raptor Lake client CPUs have a mix of P cores -+ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, -+ * vulnerable), and both enumerate RFDS_CLEAR. -+ * -+ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by -+ * platform configuration, and enumerate RFDS_NO. -+ * -+ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when -+ * safe to do so. 
-+ * -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html -+ */ -+static void __init rfds_calculations(void) -+{ -+ /* RFDS is only known to affect Intel Family 6 processors at this time. */ -+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || -+ boot_cpu_data.x86 != 6 ) -+ return; -+ -+ /* -+ * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable -+ * microcode, or an RFDS-aware hypervisor is levelling us in a pool. -+ */ -+ if ( cpu_has_rfds_no || cpu_has_rfds_clear ) -+ return; -+ -+ /* If we're virtualised, don't attempt to synthesise RFDS_NO. */ -+ if ( cpu_has_hypervisor ) -+ return; -+ -+ /* -+ * Not all CPUs are expected to get a microcode update enumerating one of -+ * RFDS_{NO,CLEAR}, or we might have out-of-date microcode. -+ */ -+ switch ( boot_cpu_data.x86_model ) -+ { -+ case INTEL_FAM6_ALDERLAKE: -+ case INTEL_FAM6_RAPTORLAKE: -+ /* -+ * Alder Lake and Raptor Lake might be a client SKU (with the -+ * Gracemont cores active, and therefore vulnerable) or might be a -+ * server SKU (with the Gracemont cores disabled, and therefore not -+ * vulnerable). -+ * -+ * See if the CPU identifies as hybrid to distinguish the two cases. -+ */ -+ if ( !cpu_has_hybrid ) -+ break; -+ fallthrough; -+ case INTEL_FAM6_ALDERLAKE_L: -+ case INTEL_FAM6_RAPTORLAKE_P: -+ case INTEL_FAM6_RAPTORLAKE_S: -+ -+ case INTEL_FAM6_ATOM_GOLDMONT: /* Apollo Lake */ -+ case INTEL_FAM6_ATOM_GOLDMONT_D: /* Denverton */ -+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */ -+ case INTEL_FAM6_ATOM_TREMONT_D: /* Snow Ridge / Parker Ridge */ -+ case INTEL_FAM6_ATOM_TREMONT: /* Elkhart Lake */ -+ case INTEL_FAM6_ATOM_TREMONT_L: /* Jasper Lake */ -+ case INTEL_FAM6_ATOM_GRACEMONT: /* Alder Lake N */ -+ return; -+ } -+ -+ /* -+ * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO, -+ * perhaps because of it's age or because of out-of-date microcode. -+ * Synthesise it. -+ */ -+ setup_force_cpu_cap(X86_FEATURE_RFDS_NO); -+} -+ - static bool __init cpu_has_gds(void) - { - /* -@@ -1872,6 +1952,7 @@ void __init init_speculation_mitigations(void) - * - * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html - * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html - * - * Relevant ucodes: - * -@@ -1901,8 +1982,12 @@ void __init init_speculation_mitigations(void) - * - * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing - * side effects as VERW and cannot be used in its place. -+ * -+ * - March 2023, for RFDS. Enumerate RFDS_CLEAR to mean that VERW now -+ * scrubs non-architectural entries from certain register files. - */ - mds_calculations(); -+ rfds_calculations(); - - /* - * Parts which enumerate FB_CLEAR are those with now-updated microcode -@@ -1934,15 +2019,19 @@ void __init init_speculation_mitigations(void) - * MLPDS/MFBDS when SMT is enabled. 
- */ - if ( opt_verw_pv == -1 ) -- opt_verw_pv = cpu_has_useful_md_clear; -+ opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear; - - if ( opt_verw_hvm == -1 ) -- opt_verw_hvm = cpu_has_useful_md_clear; -+ opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear; - - /* - * If SMT is active, and we're protecting against MDS or MMIO stale data, - * we need to scrub before going idle as well as on return to guest. - * Various pipeline resources are repartitioned amongst non-idle threads. -+ * -+ * We don't need to scrub on idle for RFDS. There are no affected cores -+ * which support SMT, despite there being affected cores in hybrid systems -+ * which have SMT elsewhere in the platform. - */ - if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || - opt_verw_mmio) && hw_smt_enabled ) -@@ -1956,7 +2045,8 @@ void __init init_speculation_mitigations(void) - * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the - * only *_CLEAR we can see. - */ -- if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) -+ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear && -+ !cpu_has_rfds_clear ) - opt_verw_hvm = false; - - /* -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index aec1407613c3..113e6cadc17d 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -264,6 +264,7 @@ XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffe - XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ - XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ - XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ -+XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */ - XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */ - XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ - XEN_CPUFEATURE(AVX512_FP16, 9*32+23) /* AVX512 FP16 instructions */ -@@ -330,6 +331,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS, 16*32+23) /* MSR_OVERCLOCKING_STATUS */ - XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */ - XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */ - XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */ -+XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */ -+XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */ - - /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ - diff --git a/xsa453-4.17-1.patch b/xsa453-4.17-1.patch deleted file mode 100644 index 07f4337..0000000 --- a/xsa453-4.17-1.patch +++ /dev/null @@ -1,150 +0,0 @@ -From: Andrew Cooper -Subject: x86/paging: Delete update_cr3()'s do_locking parameter - -Nicola reports that the XSA-438 fix introduced new MISRA violations because of -some incidental tidying it tried to do. The parameter is useless, so resolve -the MISRA regression by removing it. - -hap_update_cr3() discards the parameter entirely, while sh_update_cr3() uses -it to distinguish internal and external callers and therefore whether the -paging lock should be taken. - -However, we have paging_lock_recursive() for this purpose, which also avoids -the ability for the shadow internal callers to accidentally not hold the lock. 
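For readers unfamiliar with the idiom: a recursive paging lock makes the do_locking flag redundant because re-entry by the current owner simply nests instead of deadlocking. A minimal model of the concept (not Xen's actual implementation; the struct and helper names here are illustrative):

    /* Owner-tracking recursive lock: external callers take the raw lock,
     * the owning CPU merely bumps a nesting count on re-entry. */
    struct rlock {
        spinlock_t raw;
        int        owner;    /* CPU currently holding the lock, or -1 */
        unsigned   depth;    /* nesting count, touched only by the owner */
    };

    static void lock_recursive(struct rlock *l)
    {
        if ( l->owner != smp_processor_id() )  /* can only match if we own it */
        {
            spin_lock(&l->raw);
            l->owner = smp_processor_id();
        }
        l->depth++;
    }

    static void unlock_recursive(struct rlock *l)
    {
        if ( --l->depth == 0 )
        {
            l->owner = -1;
            spin_unlock(&l->raw);
        }
    }

With this shape, sh_update_cr3() can take the lock unconditionally, and internal shadow-code callers that already hold it cannot accidentally run unlocked, which is the property the commit message relies on.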
- -Fixes: fb0ff49fe9f7 ("x86/shadow: defer releasing of PV's top-level shadow reference") -Reported-by: Nicola Vetrini -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich -Release-acked-by: Henry Wang -(cherry picked from commit e71157d1ac2a7fbf413130663cf0a93ff9fbcf7e) - -diff --git a/xen/arch/x86/include/asm/paging.h b/xen/arch/x86/include/asm/paging.h -index 94c590f31aa8..809ff35d9a0d 100644 ---- a/xen/arch/x86/include/asm/paging.h -+++ b/xen/arch/x86/include/asm/paging.h -@@ -138,8 +138,7 @@ struct paging_mode { - paddr_t ga, uint32_t *pfec, - unsigned int *page_order); - #endif -- pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, -- bool noflush); -+ pagetable_t (*update_cr3 )(struct vcpu *v, bool noflush); - void (*update_paging_modes )(struct vcpu *v); - bool (*flush_tlb )(const unsigned long *vcpu_bitmap); - -@@ -312,7 +311,7 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v, - * as the value to load into the host CR3 to schedule this vcpu */ - static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) - { -- return paging_get_hostmode(v)->update_cr3(v, 1, noflush); -+ return paging_get_hostmode(v)->update_cr3(v, noflush); - } - - /* Update all the things that are derived from the guest's CR0/CR3/CR4. -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 57a19c3d59d1..3ad39a7dd781 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -739,8 +739,7 @@ static bool cf_check hap_invlpg(struct vcpu *v, unsigned long linear) - return 1; - } - --static pagetable_t cf_check hap_update_cr3( -- struct vcpu *v, bool do_locking, bool noflush) -+static pagetable_t cf_check hap_update_cr3(struct vcpu *v, bool noflush) - { - v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; - hvm_update_guest_cr3(v, noflush); -@@ -826,7 +825,7 @@ static void cf_check hap_update_paging_modes(struct vcpu *v) - } - - /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ -- hap_update_cr3(v, 0, false); -+ hap_update_cr3(v, false); - - unlock: - paging_unlock(d); -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index c0940f939ef0..18714dbd02ab 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2579,7 +2579,7 @@ static void sh_update_paging_modes(struct vcpu *v) - } - #endif /* OOS */ - -- v->arch.paging.mode->update_cr3(v, 0, false); -+ v->arch.paging.mode->update_cr3(v, false); - } - - void cf_check shadow_update_paging_modes(struct vcpu *v) -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index c92b354a7815..e54a507b54f6 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -2506,7 +2506,7 @@ static int cf_check sh_page_fault( - * In any case, in the PAE case, the ASSERT is not true; it can - * happen because of actions the guest is taking. */ - #if GUEST_PAGING_LEVELS == 3 -- v->arch.paging.mode->update_cr3(v, 0, false); -+ v->arch.paging.mode->update_cr3(v, false); - #else - ASSERT(d->is_shutting_down); - #endif -@@ -3224,17 +3224,13 @@ static void cf_check sh_detach_old_tables(struct vcpu *v) - } - } - --static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, -- bool noflush) -+static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool noflush) - /* Updates vcpu->arch.cr3 after the guest has changed CR3. - * Paravirtual guests should set v->arch.guest_table (and guest_table_user, - * if appropriate). 
- * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; - * this function will call hvm_update_guest_cr(v, 3) to tell them where the - * shadow tables are. -- * If do_locking != 0, assume we are being called from outside the -- * shadow code, and must take and release the paging lock; otherwise -- * that is the caller's responsibility. - */ - { - struct domain *d = v->domain; -@@ -3252,7 +3248,11 @@ static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, - return old_entry; - } - -- if ( do_locking ) paging_lock(v->domain); -+ /* -+ * This is used externally (with the paging lock not taken) and internally -+ * by the shadow code (with the lock already taken). -+ */ -+ paging_lock_recursive(v->domain); - - #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) - /* Need to resync all the shadow entries on a TLB flush. Resync -@@ -3480,8 +3480,7 @@ static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, - shadow_sync_other_vcpus(v); - #endif - -- /* Release the lock, if we took it (otherwise it's the caller's problem) */ -- if ( do_locking ) paging_unlock(v->domain); -+ paging_unlock(v->domain); - - return old_entry; - } -diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c -index 743c0ffb8514..7e4e386cd030 100644 ---- a/xen/arch/x86/mm/shadow/none.c -+++ b/xen/arch/x86/mm/shadow/none.c -@@ -52,8 +52,7 @@ static unsigned long cf_check _gva_to_gfn( - } - #endif - --static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, -- bool noflush) -+static pagetable_t cf_check _update_cr3(struct vcpu *v, bool noflush) - { - ASSERT_UNREACHABLE(); - return pagetable_null(); - diff --git a/xsa453-4.17-2.patch b/xsa453-4.17-2.patch deleted file mode 100644 index 9247e71..0000000 --- a/xsa453-4.17-2.patch +++ /dev/null @@ -1,49 +0,0 @@ -From: Andrew Cooper -Subject: xen: Swap order of actions in the FREE*() macros - -Wherever possible, it is a good idea to NULL out the visible reference to an -object prior to freeing it. The FREE*() macros already collect together both -parts, making it easy to adjust. - -This has a marginal code generation improvement, as some of the calls to the -free() function can be tailcall optimised. - -No functional change. - -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich -(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e) - -diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h -index 3dc61bcc3c07..211685a5d29c 100644 ---- a/xen/include/xen/mm.h -+++ b/xen/include/xen/mm.h -@@ -80,8 +80,9 @@ bool scrub_free_pages(void); - - /* Free an allocation, and zero the pointer to it. */ - #define FREE_XENHEAP_PAGES(p, o) do { \ -- free_xenheap_pages(p, o); \ -+ void *_ptr_ = (p); \ - (p) = NULL; \ -+ free_xenheap_pages(_ptr_, o); \ - } while ( false ) - #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) - -diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h -index 16979a117c6a..d857298011c1 100644 ---- a/xen/include/xen/xmalloc.h -+++ b/xen/include/xen/xmalloc.h -@@ -66,9 +66,10 @@ - extern void xfree(void *); - - /* Free an allocation, and zero the pointer to it. 
 */ --#define XFREE(p) do { \ -- xfree(p); \ -- (p) = NULL; \ -+#define XFREE(p) do { \ -+ void *_ptr_ = (p); \ -+ (p) = NULL; \ -+ xfree(_ptr_); \ - } while ( false ) - - /* Underlying functions */ diff --git a/xsa453-4.17-3.patch b/xsa453-4.17-3.patch deleted file mode 100644 index d9b3bfc..0000000 --- a/xsa453-4.17-3.patch +++ /dev/null @@ -1,314 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: x86/spinlock: introduce support for blocking speculation into - critical regions - -Introduce a new Kconfig option to block speculation into lock protected -critical regions. The Kconfig option is enabled by default, but the mitigation -won't be engaged unless it's explicitly enabled in the command line using -`spec-ctrl=lock-harden`. - -Convert the spinlock acquire macros into always-inline functions, and introduce -a speculation barrier after the lock has been taken. Note the speculation -barrier is not placed inside the implementation of the spin lock functions, so -as to prevent speculation from falling through the call to the lock functions -resulting in the barrier also being skipped. - -trylock variants are protected using a construct akin to the existing -evaluate_nospec(). - -This patch only implements the speculation barrier for x86. - -Note spin locks are the only locking primitive taken care of in this change; -further locking primitives will be adjusted by separate changes. - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index d909ec94fe7c..e1d56407dd88 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2327,7 +2327,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - > {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, --> unpriv-mmio,gds-mit,div-scrub}=<bool> ]` -+> unpriv-mmio,gds-mit,div-scrub,lock-harden}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2454,6 +2454,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen - from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate - DIV-leakage on hardware believed to be vulnerable. - -+If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=` -+boolean can be used to force or prevent Xen from using speculation barriers to -+protect lock critical regions. This mitigation won't be engaged by default, -+and needs to be explicitly enabled on the command line. -+ - ### sync_console -> `= <boolean>` - -diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h -index c3aad21c3b43..7e8221fd85dd 100644 ---- a/xen/arch/x86/include/asm/cpufeatures.h -+++ b/xen/arch/x86/include/asm/cpufeatures.h -@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF, X86_SYNTH( 8)) /* APERFMPERF */ - XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */ - XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */ - XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */ --/* Bit 12 unused.
*/ -+XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ - XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ - XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ - XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ -diff --git a/xen/arch/x86/include/asm/nospec.h b/xen/arch/x86/include/asm/nospec.h -index 7150e76b87fb..0725839e1982 100644 ---- a/xen/arch/x86/include/asm/nospec.h -+++ b/xen/arch/x86/include/asm/nospec.h -@@ -38,6 +38,32 @@ static always_inline void block_speculation(void) - barrier_nospec_true(); - } - -+static always_inline void arch_block_lock_speculation(void) -+{ -+ alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+} -+ -+/* Allow to insert a read memory barrier into conditionals */ -+static always_inline bool barrier_lock_true(void) -+{ -+ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+ return true; -+} -+ -+static always_inline bool barrier_lock_false(void) -+{ -+ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+ return false; -+} -+ -+static always_inline bool arch_lock_evaluate_nospec(bool condition) -+{ -+ if ( condition ) -+ return barrier_lock_true(); -+ else -+ return barrier_lock_false(); -+} -+ - #endif /* _ASM_X86_NOSPEC_H */ - - /* -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 1ee81e2dfe79..ac21af2c5c0f 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -65,6 +65,7 @@ int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - static bool __initdata opt_branch_harden = - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); -+static bool __initdata opt_lock_harden; - - bool __initdata bsp_delay_spec_ctrl; - uint8_t __read_mostly default_xen_spec_ctrl; -@@ -133,6 +134,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_ssbd = false; - opt_l1d_flush = 0; - opt_branch_harden = false; -+ opt_lock_harden = false; - opt_srb_lock = 0; - opt_unpriv_mmio = false; - opt_gds_mit = 0; -@@ -298,6 +300,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) - rc = -EINVAL; - } - } -+ else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) -+ { -+ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) -+ opt_lock_harden = val; -+ else -+ { -+ no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); -+ rc = -EINVAL; -+ } -+ } - else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) - opt_srb_lock = val; - else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) -@@ -500,7 +512,8 @@ static void __init print_details(enum ind_thunk thunk) - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || -- IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) - printk(" Compiled-in support:" - #ifdef CONFIG_INDIRECT_THUNK - " INDIRECT_THUNK" -@@ -516,11 +529,14 @@ static void __init print_details(enum ind_thunk thunk) - #endif - #ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS - " HARDEN_GUEST_ACCESS" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ " HARDEN_LOCK" - #endif - "\n"); - - /* Settings for Xen's protection, irrespective of guests. 
*/ -- printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -+ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", - thunk != THUNK_NONE ? "BTI-Thunk: " : "", - thunk == THUNK_NONE ? "" : - thunk == THUNK_RETPOLINE ? "RETPOLINE, " : -@@ -547,7 +563,8 @@ static void __init print_details(enum ind_thunk thunk) - opt_verw_pv || opt_verw_hvm || - opt_verw_mmio ? " VERW" : "", - opt_div_scrub ? " DIV" : "", -- opt_branch_harden ? " BRANCH_HARDEN" : ""); -+ opt_branch_harden ? " BRANCH_HARDEN" : "", -+ opt_lock_harden ? " LOCK_HARDEN" : ""); - - /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ - if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) -@@ -1930,6 +1947,9 @@ void __init init_speculation_mitigations(void) - if ( !opt_branch_harden ) - setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN); - -+ if ( !opt_lock_harden ) -+ setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN); -+ - /* - * We do not disable HT by default on affected hardware. - * -diff --git a/xen/common/Kconfig b/xen/common/Kconfig -index e7794cb7f681..cd7385153823 100644 ---- a/xen/common/Kconfig -+++ b/xen/common/Kconfig -@@ -173,6 +173,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS - - If unsure, say Y. - -+config SPECULATIVE_HARDEN_LOCK -+ bool "Speculative lock context hardening" -+ default y -+ depends on X86 -+ help -+ Contemporary processors may use speculative execution as a -+ performance optimisation, but this can potentially be abused by an -+ attacker to leak data via speculative sidechannels. -+ -+ One source of data leakage is via speculative accesses to lock -+ critical regions. -+ -+ This option is disabled by default at run time, and needs to be -+ enabled on the command line. -+ -+ If unsure, say Y. 
-+ - endmenu - - config DIT_DEFAULT -diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h -index 76255bc46efe..455284640396 100644 ---- a/xen/include/xen/nospec.h -+++ b/xen/include/xen/nospec.h -@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, - #define array_access_nospec(array, index) \ - (array)[array_index_nospec(index, ARRAY_SIZE(array))] - -+static always_inline void block_lock_speculation(void) -+{ -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ arch_block_lock_speculation(); -+#endif -+} -+ -+static always_inline bool lock_evaluate_nospec(bool condition) -+{ -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ return arch_lock_evaluate_nospec(condition); -+#endif -+ return condition; -+} -+ - #endif /* XEN_NOSPEC_H */ - - /* -diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h -index 961891bea4d5..daf48fdea709 100644 ---- a/xen/include/xen/spinlock.h -+++ b/xen/include/xen/spinlock.h -@@ -1,6 +1,7 @@ - #ifndef __SPINLOCK_H__ - #define __SPINLOCK_H__ - -+#include <xen/nospec.h> - #include <xen/time.h> - #include <asm/system.h> - #include <asm/spinlock.h> -@@ -189,13 +190,30 @@ int _spin_trylock_recursive(spinlock_t *lock); - void _spin_lock_recursive(spinlock_t *lock); - void _spin_unlock_recursive(spinlock_t *lock); - --#define spin_lock(l) _spin_lock(l) --#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d) --#define spin_lock_irq(l) _spin_lock_irq(l) -+static always_inline void spin_lock(spinlock_t *l) -+{ -+ _spin_lock(l); -+ block_lock_speculation(); -+} -+ -+static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data), -+ void *d) -+{ -+ _spin_lock_cb(l, c, d); -+ block_lock_speculation(); -+} -+ -+static always_inline void spin_lock_irq(spinlock_t *l) -+{ -+ _spin_lock_irq(l); -+ block_lock_speculation(); -+} -+ - #define spin_lock_irqsave(l, f) \ - ({ \ - BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ - ((f) = _spin_lock_irqsave(l)); \ -+ block_lock_speculation(); \ - }) - - #define spin_unlock(l) _spin_unlock(l) -@@ -203,7 +221,7 @@ void _spin_unlock_recursive(spinlock_t *lock); - #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) - - #define spin_is_locked(l) _spin_is_locked(l) --#define spin_trylock(l) _spin_trylock(l) -+#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l)) - - #define spin_trylock_irqsave(lock, flags) \ - ({ \ -@@ -224,8 +242,15 @@ void _spin_unlock_recursive(spinlock_t *lock); - * are any critical regions that cannot form part of such a set, they can use - * standard spin_[un]lock(). - */ --#define spin_trylock_recursive(l) _spin_trylock_recursive(l) --#define spin_lock_recursive(l) _spin_lock_recursive(l) -+#define spin_trylock_recursive(l) \ -+ lock_evaluate_nospec(_spin_trylock_recursive(l)) -+ -+static always_inline void spin_lock_recursive(spinlock_t *l) -+{ -+ _spin_lock_recursive(l); -+ block_lock_speculation(); -+} -+ - #define spin_unlock_recursive(l) _spin_unlock_recursive(l) - - #endif /* __SPINLOCK_H__ */ diff --git a/xsa453-4.17-4.patch b/xsa453-4.17-4.patch deleted file mode 100644 index 0465124..0000000 --- a/xsa453-4.17-4.patch +++ /dev/null @@ -1,113 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: rwlock: introduce support for blocking speculation into critical - regions - -Introduce inline wrappers as required and add direct calls to -block_lock_speculation() in order to prevent speculation into the rwlock -protected critical regions.
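The wrapper idiom this series applies everywhere can be shown in isolation. A minimal sketch, assuming x86 lfence semantics; the names and the toy reader counter are invented (it is not a real rwlock):

    #include <stdatomic.h>

    static atomic_int readers;

    /* Out-of-line primitive: acquires, but deliberately has no barrier. */
    void _example_read_lock(void)
    {
        atomic_fetch_add_explicit(&readers, 1, memory_order_acquire);
    }

    /* Always-inline wrapper: the fence is emitted in the caller's own
     * instruction stream, so speculation that runs past the primitive's
     * call/return still hits it before the critical region. */
    static inline __attribute__((always_inline)) void example_read_lock(void)
    {
        _example_read_lock();
        __asm__ volatile ("lfence" ::: "memory");
    }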
- -Note the rwlock primitives are adjusted to use the non speculation safe variants -of the spinlock handlers, as a speculation barrier is added in the rwlock -calling wrappers. - -trylock variants are protected by using lock_evaluate_nospec(). - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59) - -diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c -index aa15529bbe8c..cda06b9d6ece 100644 ---- a/xen/common/rwlock.c -+++ b/xen/common/rwlock.c -@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock) - - /* - * Put the reader into the wait queue. -+ * -+ * Use the speculation unsafe helper, as it's the caller's responsibility to -+ * issue a speculation barrier if required. - */ -- spin_lock(&lock->lock); -+ _spin_lock(&lock->lock); - - /* - * At the head of the wait queue now, wait until the writer state -@@ -64,8 +67,13 @@ void queue_write_lock_slowpath(rwlock_t *lock) - { - u32 cnts; - -- /* Put the writer into the wait queue. */ -+ /* -+ * Put the writer into the wait queue. -+ * -+ * Use the speculation unsafe helper, as it's the caller's responsibility to -+ * issue a speculation barrier if required. -+ */ -+ _spin_lock(&lock->lock); - - /* Try to acquire the lock directly if no reader is present. */ - if ( !atomic_read(&lock->cnts) && -diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h -index 0cc9167715b3..fd0458be94ae 100644 ---- a/xen/include/xen/rwlock.h -+++ b/xen/include/xen/rwlock.h -@@ -247,27 +247,49 @@ static inline int _rw_is_write_locked(rwlock_t *lock) - return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED; - } - --#define read_lock(l) _read_lock(l) --#define read_lock_irq(l) _read_lock_irq(l) -+static always_inline void read_lock(rwlock_t *l) -+{ -+ _read_lock(l); -+ block_lock_speculation(); -+} -+ -+static always_inline void read_lock_irq(rwlock_t *l) -+{ -+ _read_lock_irq(l); -+ block_lock_speculation(); -+} -+ - #define read_lock_irqsave(l, f) \ - ({ \ - BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ - ((f) = _read_lock_irqsave(l)); \ -+ block_lock_speculation(); \ - }) - - #define read_unlock(l) _read_unlock(l) - #define read_unlock_irq(l) _read_unlock_irq(l) - #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) --#define read_trylock(l) _read_trylock(l) -+#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l)) -+ -+static always_inline void write_lock(rwlock_t *l) -+{ -+ _write_lock(l); -+ block_lock_speculation(); -+} -+ -+static always_inline void write_lock_irq(rwlock_t *l) -+{ -+ _write_lock_irq(l); -+ block_lock_speculation(); -+} - --#define write_lock(l) _write_lock(l) --#define write_lock_irq(l) _write_lock_irq(l) - #define write_lock_irqsave(l, f) \ - ({ \ - BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ - ((f) = _write_lock_irqsave(l)); \ -+ block_lock_speculation(); \ - }) --#define write_trylock(l) _write_trylock(l) -+#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l)) - - #define write_unlock(l) _write_unlock(l) - #define write_unlock_irq(l) _write_unlock_irq(l) diff --git a/xsa453-4.17-5.patch b/xsa453-4.17-5.patch deleted file mode 100644 index 7a44eb8..0000000 --- a/xsa453-4.17-5.patch +++ /dev/null @@ -1,75 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: percpu-rwlock: introduce support for blocking speculation into - critical regions - -Add direct calls to block_lock_speculation() where required in order to prevent
-speculation into the lock protected critical regions. Also convert -_percpu_read_lock() from inline to always_inline. - -Note that _percpu_write_lock() has been modified to use the non speculation -safe variant of the locking primitives, as a speculation barrier is added -unconditionally by the calling wrapper. - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441) - -diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c -index cda06b9d6ece..4da0ed8fadb0 100644 ---- a/xen/common/rwlock.c -+++ b/xen/common/rwlock.c -@@ -125,8 +125,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata, - /* - * First take the write lock to protect against other writers or slow - * path readers. -+ * -+ * Note we use the speculation unsafe variant of write_lock(), as the -+ * calling wrapper already adds a speculation barrier after the lock has -+ * been taken. - */ -- write_lock(&percpu_rwlock->rwlock); -+ _write_lock(&percpu_rwlock->rwlock); - - /* Now set the global variable so that readers start using read_lock. */ - percpu_rwlock->writer_activating = 1; -diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h -index fd0458be94ae..abe0804bf7d5 100644 ---- a/xen/include/xen/rwlock.h -+++ b/xen/include/xen/rwlock.h -@@ -326,8 +326,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata, - #define percpu_rwlock_resource_init(l, owner) \ - (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner))) - --static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, -- percpu_rwlock_t *percpu_rwlock) -+static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, -+ percpu_rwlock_t *percpu_rwlock) - { - /* Validate the correct per_cpudata variable has been provided. */ - _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock); -@@ -362,6 +362,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, - } - else - { -+ /* Other branch already has a speculation barrier in read_lock(). */ -+ block_lock_speculation(); - /* All other paths have implicit check_lock() calls via read_lock(). */ - check_lock(&percpu_rwlock->rwlock.lock.debug, false); - } -@@ -410,8 +412,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata, - _percpu_read_lock(&get_per_cpu_var(percpu), lock) - #define percpu_read_unlock(percpu, lock) \ - _percpu_read_unlock(&get_per_cpu_var(percpu), lock) --#define percpu_write_lock(percpu, lock) \ -- _percpu_write_lock(&get_per_cpu_var(percpu), lock) -+ -+#define percpu_write_lock(percpu, lock) \ -+({ \ -+ _percpu_write_lock(&get_per_cpu_var(percpu), lock); \ -+ block_lock_speculation(); \ -+}) - #define percpu_write_unlock(percpu, lock) \ - _percpu_write_unlock(&get_per_cpu_var(percpu), lock) - diff --git a/xsa453-4.17-6.patch b/xsa453-4.17-6.patch deleted file mode 100644 index 24cc56a..0000000 --- a/xsa453-4.17-6.patch +++ /dev/null @@ -1,382 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: locking: attempt to ensure lock wrappers are always inline - -In order to prevent the locking speculation barriers from being inside of -`call`ed functions that could be speculatively bypassed. - -While there also add an extra locking barrier to _mm_write_lock() in the branch -taken when the lock is already held.
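That corner case is easy to miss, so here is a standalone sketch of it (invented names and stubs, x86-only lfence; the real change is in the mm-locks.h hunk of this patch): on first entry the barrier naturally follows the lock acquisition, but the re-entry branch takes no lock and must fence explicitly or it becomes a speculation hole.

    static int this_cpu(void) { return 0; }                      /* stub */
    static void fence(void) { __asm__ volatile ("lfence" ::: "memory"); }

    struct rec_wlock {
        int owner;              /* initialise to -1 (unowned) */
        unsigned int recurse;
    };

    static void raw_write_lock(struct rec_wlock *l) { l->owner = this_cpu(); }

    static inline __attribute__((always_inline)) void
    example_write_lock(struct rec_wlock *l)
    {
        if ( l->owner != this_cpu() )
        {
            raw_write_lock(l);
            fence();        /* barrier after actually taking the lock */
        }
        else
            fence();        /* already held: no lock is taken on this
                             * path, so it needs its own barrier */
        l->recurse++;
    }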
- -Note some functions are switched to use the unsafe variants (without speculation -barrier) of the locking primitives, but a speculation barrier is always added -to the exposed public lock wrapping helper. That's the case with -sched_spin_lock_double() or pcidevs_lock() for example. - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762) - -diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c -index cb1d81bf9e82..66f10952456b 100644 ---- a/xen/arch/x86/hvm/vpt.c -+++ b/xen/arch/x86/hvm/vpt.c -@@ -161,7 +161,7 @@ static int pt_irq_masked(struct periodic_time *pt) - * pt->vcpu field, because another thread holding the pt_migrate lock - * may already be spinning waiting for your vcpu lock. - */ --static void pt_vcpu_lock(struct vcpu *v) -+static always_inline void pt_vcpu_lock(struct vcpu *v) - { - spin_lock(&v->arch.hvm.tm_lock); - } -@@ -180,9 +180,13 @@ static void pt_vcpu_unlock(struct vcpu *v) - * need to take an additional lock that protects against pt->vcpu - * changing. - */ --static void pt_lock(struct periodic_time *pt) -+static always_inline void pt_lock(struct periodic_time *pt) - { -- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); -+ /* -+ * Use the speculation unsafe variant for the first lock, as the following -+ * lock taking helper already includes a speculation barrier. -+ */ -+ _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); - spin_lock(&pt->vcpu->arch.hvm.tm_lock); - } - -diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h -index f6a0207a8087..823d627fd001 100644 ---- a/xen/arch/x86/include/asm/irq.h -+++ b/xen/arch/x86/include/asm/irq.h -@@ -178,6 +178,7 @@ void cf_check irq_complete_move(struct irq_desc *); - - extern struct irq_desc *irq_desc; - -+/* Not speculation safe, only used for AP bringup. 
*/ - void lock_vector_lock(void); - void unlock_vector_lock(void); - -diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h -index c1523aeccf99..265239c49f39 100644 ---- a/xen/arch/x86/mm/mm-locks.h -+++ b/xen/arch/x86/mm/mm-locks.h -@@ -86,8 +86,8 @@ static inline void _set_lock_level(int l) - this_cpu(mm_lock_level) = l; - } - --static inline void _mm_lock(const struct domain *d, mm_lock_t *l, -- const char *func, int level, int rec) -+static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, -+ const char *func, int level, int rec) - { - if ( !((mm_locked_by_me(l)) && rec) ) - _check_lock_level(d, level); -@@ -137,8 +137,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) - return (l->locker == get_processor_id()); - } - --static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, -- const char *func, int level) -+static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, -+ const char *func, int level) - { - if ( !mm_write_locked_by_me(l) ) - { -@@ -149,6 +149,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, - l->unlock_level = _get_lock_level(); - _set_lock_level(_lock_level(d, level)); - } -+ else -+ block_speculation(); - l->recurse_count++; - } - -@@ -162,8 +164,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l) - percpu_write_unlock(p2m_percpu_rwlock, &l->lock); - } - --static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, -- int level) -+static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, -+ int level) - { - _check_lock_level(d, level); - percpu_read_lock(p2m_percpu_rwlock, &l->lock); -@@ -178,15 +180,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l) - - /* This wrapper uses the line number to express the locking order below */ - #define declare_mm_lock(name) \ -- static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ -- const char *func, int rec) \ -+ static always_inline void mm_lock_##name( \ -+ const struct domain *d, mm_lock_t *l, const char *func, int rec) \ - { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } - #define declare_mm_rwlock(name) \ -- static inline void mm_write_lock_##name(const struct domain *d, \ -- mm_rwlock_t *l, const char *func) \ -+ static always_inline void mm_write_lock_##name( \ -+ const struct domain *d, mm_rwlock_t *l, const char *func) \ - { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ -- static inline void mm_read_lock_##name(const struct domain *d, \ -- mm_rwlock_t *l) \ -+ static always_inline void mm_read_lock_##name(const struct domain *d, \ -+ mm_rwlock_t *l) \ - { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } - /* These capture the name of the calling function */ - #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) -@@ -321,7 +323,7 @@ declare_mm_lock(altp2mlist) - #define MM_LOCK_ORDER_altp2m 40 - declare_mm_rwlock(altp2m); - --static inline void p2m_lock(struct p2m_domain *p) -+static always_inline void p2m_lock(struct p2m_domain *p) - { - if ( p2m_is_altp2m(p) ) - mm_write_lock(altp2m, p->domain, &p->lock); -diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c -index fc110506dce2..99dbcb3101e2 100644 ---- a/xen/arch/x86/mm/p2m-pod.c -+++ b/xen/arch/x86/mm/p2m-pod.c -@@ -36,7 +36,7 @@ - #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) - - /* Enforce lock ordering when grabbing the "external" page_alloc lock */ --static inline void lock_page_alloc(struct p2m_domain *p2m) -+static always_inline void 
lock_page_alloc(struct p2m_domain *p2m) - { - page_alloc_mm_pre_lock(p2m->domain); - spin_lock(&(p2m->domain->page_alloc_lock)); -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index f5e0b12d1520..dada9f15f574 100644 ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -62,7 +62,7 @@ - * just assume the event channel is free or unbound at the moment when the - * evtchn_read_trylock() returns false. - */ --static inline void evtchn_write_lock(struct evtchn *evtchn) -+static always_inline void evtchn_write_lock(struct evtchn *evtchn) - { - write_lock(&evtchn->lock); - -@@ -364,7 +364,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port) - return rc; - } - --static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) -+static always_inline void double_evtchn_lock(struct evtchn *lchn, -+ struct evtchn *rchn) - { - ASSERT(lchn != rchn); - -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index ee7cc496b8cb..62a8685cd514 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -410,7 +410,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn) - - static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock); - --static inline void grant_read_lock(struct grant_table *gt) -+static always_inline void grant_read_lock(struct grant_table *gt) - { - percpu_read_lock(grant_rwlock, >->lock); - } -@@ -420,7 +420,7 @@ static inline void grant_read_unlock(struct grant_table *gt) - percpu_read_unlock(grant_rwlock, >->lock); - } - --static inline void grant_write_lock(struct grant_table *gt) -+static always_inline void grant_write_lock(struct grant_table *gt) - { - percpu_write_lock(grant_rwlock, >->lock); - } -@@ -457,7 +457,7 @@ nr_active_grant_frames(struct grant_table *gt) - return num_act_frames_from_sha_frames(nr_grant_frames(gt)); - } - --static inline struct active_grant_entry * -+static always_inline struct active_grant_entry * - active_entry_acquire(struct grant_table *t, grant_ref_t e) - { - struct active_grant_entry *act; -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 078beb1adbbd..29bbab5ac6fd 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu) - * This avoids dead- or live-locks when this code is running on both - * cpus at the same time. - */ --static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, -- unsigned long *flags) -+static always_inline void sched_spin_lock_double( -+ spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags) - { -+ /* -+ * In order to avoid extra overhead, use the locking primitives without the -+ * speculation barrier, and introduce a single barrier here. 
-+ */ - if ( lock1 == lock2 ) - { -- spin_lock_irqsave(lock1, *flags); -+ *flags = _spin_lock_irqsave(lock1); - } - else if ( lock1 < lock2 ) - { -- spin_lock_irqsave(lock1, *flags); -- spin_lock(lock2); -+ *flags = _spin_lock_irqsave(lock1); -+ _spin_lock(lock2); - } - else - { -- spin_lock_irqsave(lock2, *flags); -- spin_lock(lock1); -+ *flags = _spin_lock_irqsave(lock2); -+ _spin_lock(lock1); - } -+ block_lock_speculation(); - } - - static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index 0527a8c70d1c..24a93dd0c123 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch); - #define cpumask_scratch (&this_cpu(cpumask_scratch)) - #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c)) - -+/* -+ * Deal with _spin_lock_irqsave() returning the flags value instead of storing -+ * it in a passed parameter. -+ */ -+#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock) -+#define _sched_spinlock1(lock, irq, arg) ({ \ -+ BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \ -+ (arg) = _spin_lock##irq(lock); \ -+}) -+ -+#define _sched_spinlock__(nr) _sched_spinlock ## nr -+#define _sched_spinlock_(nr) _sched_spinlock__(nr) -+#define _sched_spinlock(lock, irq, args...) \ -+ _sched_spinlock_(count_args(args))(lock, irq, ## args) -+ - #define sched_lock(kind, param, cpu, irq, arg...) \ --static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ -+static always_inline spinlock_t \ -+*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ - { \ - for ( ; ; ) \ - { \ -@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ - * \ - * It may also be the case that v->processor may change but the \ - * lock may be the same; this will succeed in that case. \ -+ * \ -+ * Use the speculation unsafe locking helper, there's a speculation \ -+ * barrier before returning to the caller. \ - */ \ -- spin_lock##irq(lock, ## arg); \ -+ _sched_spinlock(lock, irq, ## arg); \ - if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \ -+ { \ -+ block_lock_speculation(); \ - return lock; \ -+ } \ - spin_unlock##irq(lock, ## arg); \ - } \ - } -diff --git a/xen/common/timer.c b/xen/common/timer.c -index 9b5016d5ed82..459668d417f4 100644 ---- a/xen/common/timer.c -+++ b/xen/common/timer.c -@@ -240,7 +240,7 @@ static inline void deactivate_timer(struct timer *timer) - list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); - } - --static inline bool_t timer_lock(struct timer *timer) -+static inline bool_t timer_lock_unsafe(struct timer *timer) - { - unsigned int cpu; - -@@ -254,7 +254,8 @@ static inline bool_t timer_lock(struct timer *timer) - rcu_read_unlock(&timer_cpu_read_lock); - return 0; - } -- spin_lock(&per_cpu(timers, cpu).lock); -+ /* Use the speculation unsafe variant, the wrapper has the barrier. 
*/ -+ _spin_lock(&per_cpu(timers, cpu).lock); - if ( likely(timer->cpu == cpu) ) - break; - spin_unlock(&per_cpu(timers, cpu).lock); -@@ -267,8 +268,9 @@ static inline bool_t timer_lock(struct timer *timer) - #define timer_lock_irqsave(t, flags) ({ \ - bool_t __x; \ - local_irq_save(flags); \ -- if ( !(__x = timer_lock(t)) ) \ -+ if ( !(__x = timer_lock_unsafe(t)) ) \ - local_irq_restore(flags); \ -+ block_lock_speculation(); \ - __x; \ - }) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 8c62b14d19c1..1b3d28516643 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -52,9 +52,10 @@ struct pci_seg { - - static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED; - --void pcidevs_lock(void) -+/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */ -+void pcidevs_lock_unsafe(void) - { -- spin_lock_recursive(&_pcidevs_lock); -+ _spin_lock_recursive(&_pcidevs_lock); - } - - void pcidevs_unlock(void) -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index 8eae9984a9f1..dd96e84c6956 100644 ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); - #define bucket_from_port(d, p) \ - ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) - --static inline void evtchn_read_lock(struct evtchn *evtchn) -+static always_inline void evtchn_read_lock(struct evtchn *evtchn) - { - read_lock(&evtchn->lock); - } - --static inline bool evtchn_read_trylock(struct evtchn *evtchn) -+static always_inline bool evtchn_read_trylock(struct evtchn *evtchn) - { - return read_trylock(&evtchn->lock); - } -diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h -index 5975ca2f3032..b373f139d136 100644 ---- a/xen/include/xen/pci.h -+++ b/xen/include/xen/pci.h -@@ -155,8 +155,12 @@ struct pci_dev { - * devices, it also sync the access to the msi capability that is not - * interrupt handling related (the mask bit register). - */ -- --void pcidevs_lock(void); -+void pcidevs_lock_unsafe(void); -+static always_inline void pcidevs_lock(void) -+{ -+ pcidevs_lock_unsafe(); -+ block_lock_speculation(); -+} - void pcidevs_unlock(void); - bool_t __must_check pcidevs_locked(void); - diff --git a/xsa453-4.17-7.patch b/xsa453-4.17-7.patch deleted file mode 100644 index f110929..0000000 --- a/xsa453-4.17-7.patch +++ /dev/null @@ -1,61 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: x86/mm: add speculation barriers to open coded locks - -Add a speculation barrier to the clearly identified open-coded lock taking -functions. - -Note that the memory sharing page_lock() replacement (_page_lock()) is left -as-is, as the code is experimental and not security supported. - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4) - -diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h -index a5d7fdd32ea7..5845b729c3f7 100644 ---- a/xen/arch/x86/include/asm/mm.h -+++ b/xen/arch/x86/include/asm/mm.h -@@ -393,7 +393,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size); - * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is - * only supported for hvm guests, which do not have PV PTEs updated. 
- */ --int page_lock(struct page_info *page); -+int page_lock_unsafe(struct page_info *page); -+#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg)) -+ - void page_unlock(struct page_info *page); - - void put_page_type(struct page_info *page); -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 330c4abcd10e..8d19d719bd16 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2033,7 +2033,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) { - #define current_locked_page_ne_check(x) true - #endif - --int page_lock(struct page_info *page) -+int page_lock_unsafe(struct page_info *page) - { - unsigned long x, nx; - -@@ -2094,7 +2094,7 @@ void page_unlock(struct page_info *page) - * l3t_lock(), so to avoid deadlock we must avoid grabbing them in - * reverse order. - */ --static void l3t_lock(struct page_info *page) -+static always_inline void l3t_lock(struct page_info *page) - { - unsigned long x, nx; - -@@ -2103,6 +2103,8 @@ static void l3t_lock(struct page_info *page) - cpu_relax(); - nx = x | PGT_locked; - } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); -+ -+ block_lock_speculation(); - } - - static void l3t_unlock(struct page_info *page) diff --git a/xsa453-4.17-8.patch b/xsa453-4.17-8.patch deleted file mode 100644 index a9de529..0000000 --- a/xsa453-4.17-8.patch +++ /dev/null @@ -1,201 +0,0 @@ -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= -Subject: x86: protect conditional lock taking from speculative execution - -Conditionally taken locks that use the pattern: - -if ( lock ) - spin_lock(...); - -Need an else branch in order to issue an speculation barrier in the else case, -just like it's done in case the lock needs to be acquired. - -eval_nospec() could be used on the condition itself, but that would result in a -double barrier on the branch where the lock is taken. - -Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to -conditionally take a lock in a speculation safe way. 
- -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 8d19d719bd16..d31b8d56ffbc 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5023,8 +5023,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) - if ( !l3t ) - return NULL; - UNMAP_DOMAIN_PAGE(l3t); -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) - { - l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); -@@ -5061,8 +5060,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) - return NULL; - } - UNMAP_DOMAIN_PAGE(l2t); -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) - { - l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); -@@ -5100,8 +5098,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) - return NULL; - } - UNMAP_DOMAIN_PAGE(l1t); -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) - { - l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); -@@ -5132,6 +5129,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) - do { \ - if ( locking ) \ - l3t_lock(page); \ -+ else \ -+ block_lock_speculation(); \ - } while ( false ) - - #define L3T_UNLOCK(page) \ -@@ -5347,8 +5346,7 @@ int map_pages_to_xen( - if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) - flush_flags |= FLUSH_TLB_GLOBAL; - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && - (l3e_get_flags(*pl3e) & _PAGE_PSE) ) - { -@@ -5452,8 +5450,7 @@ int map_pages_to_xen( - if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) - flush_flags |= FLUSH_TLB_GLOBAL; - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && - (l2e_get_flags(*pl2e) & _PAGE_PSE) ) - { -@@ -5494,8 +5491,7 @@ int map_pages_to_xen( - unsigned long base_mfn; - const l1_pgentry_t *l1t; - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - - ol2e = *pl2e; - /* -@@ -5549,8 +5545,7 @@ int map_pages_to_xen( - unsigned long base_mfn; - const l2_pgentry_t *l2t; - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - - ol3e = *pl3e; - /* -@@ -5694,8 +5689,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - l3e_get_flags(*pl3e))); - UNMAP_DOMAIN_PAGE(l2t); - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && - (l3e_get_flags(*pl3e) & _PAGE_PSE) ) - { -@@ -5754,8 +5748,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - l2e_get_flags(*pl2e) & ~_PAGE_PSE)); - UNMAP_DOMAIN_PAGE(l1t); - -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && - (l2e_get_flags(*pl2e) & _PAGE_PSE) ) - { -@@ -5799,8 +5792,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - */ - if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) - continue; -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - - /* - * L2E may be 
already cleared, or set to a superpage, by -@@ -5847,8 +5839,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - if ( (nf & _PAGE_PRESENT) || - ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) - continue; -- if ( locking ) -- spin_lock(&map_pgdir_lock); -+ spin_lock_if(locking, &map_pgdir_lock); - - /* - * L3E may be already cleared, or set to a superpage, by -diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h -index 265239c49f39..3ea2d8eb032c 100644 ---- a/xen/arch/x86/mm/mm-locks.h -+++ b/xen/arch/x86/mm/mm-locks.h -@@ -347,6 +347,15 @@ static inline void p2m_unlock(struct p2m_domain *p) - #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) - #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) - -+static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m, -+ gfn_t gfn, unsigned int order) -+{ -+ if ( condition ) -+ gfn_lock(p2m, gfn, order); -+ else -+ block_lock_speculation(); -+} -+ - /* PoD lock (per-p2m-table) - * - * Protects private PoD data structs: entry and cache -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index b28c899b5ea7..1fa9e01012a2 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -292,9 +292,8 @@ mfn_t p2m_get_gfn_type_access(struct p2m_domain *p2m, gfn_t gfn, - if ( q & P2M_UNSHARE ) - q |= P2M_ALLOC; - -- if ( locked ) -- /* Grab the lock here, don't release until put_gfn */ -- gfn_lock(p2m, gfn, 0); -+ /* Grab the lock here, don't release until put_gfn */ -+ gfn_lock_if(locked, p2m, gfn, 0); - - mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL); - -diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h -index daf48fdea709..7e75d0e2e7fb 100644 ---- a/xen/include/xen/spinlock.h -+++ b/xen/include/xen/spinlock.h -@@ -216,6 +216,14 @@ static always_inline void spin_lock_irq(spinlock_t *l) - block_lock_speculation(); \ - }) - -+/* Conditionally take a spinlock in a speculation safe way. */ -+static always_inline void spin_lock_if(bool condition, spinlock_t *l) -+{ -+ if ( condition ) -+ _spin_lock(l); -+ block_lock_speculation(); -+} -+ - #define spin_unlock(l) _spin_unlock(l) - #define spin_unlock_irq(l) _spin_unlock_irq(l) - #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f)
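To close the series out, a small runnable toy exercising the trylock-hardening idiom used throughout (invented names, plain C stubs in place of Xen primitives, x86-only lfence asm):

    #include <stdbool.h>
    #include <stdio.h>

    static void fence(void) { __asm__ volatile ("lfence" ::: "memory"); }

    struct toy_lock { int taken; };

    static bool _toy_trylock(struct toy_lock *l)
    {
        if ( l->taken )
            return false;
        l->taken = 1;
        return true;
    }

    static void _toy_unlock(struct toy_lock *l) { l->taken = 0; }

    /* Shape of lock_evaluate_nospec(): fence both outcomes, so a
     * mispredicted "success" cannot speculatively enter the region. */
    static inline __attribute__((always_inline)) bool
    toy_trylock_nospec(struct toy_lock *l)
    {
        if ( _toy_trylock(l) )
        {
            fence();
            return true;
        }
        fence();
        return false;
    }

    int main(void)
    {
        struct toy_lock l = { 0 };

        if ( toy_trylock_nospec(&l) )   /* cf. spin_trylock() above */
        {
            printf("critical region entered with the lock held\n");
            _toy_unlock(&l);
        }
        return 0;
    }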