diff --git a/.gitignore b/.gitignore
index fd62b03..6077ab0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ lwip-1.3.0.tar.gz
 pciutils-2.2.9.tar.bz2
 zlib-1.2.3.tar.gz
 polarssl-1.1.4-gpl.tgz
-/xen-4.14.4.tar.gz
+/xen-4.14.5.tar.gz
diff --git a/sources b/sources
index ad71745..9523d9e 100644
--- a/sources
+++ b/sources
@@ -4,4 +4,4 @@ SHA512 (newlib-1.16.0.tar.gz) = 40eb96bbc6736a16b6399e0cdb73e853d0d90b685c967e77
 SHA512 (zlib-1.2.3.tar.gz) = 021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e
 SHA512 (polarssl-1.1.4-gpl.tgz) = 88da614e4d3f4409c4fd3bb3e44c7587ba051e3fed4e33d526069a67e8180212e1ea22da984656f50e290049f60ddca65383e5983c0f8884f648d71f698303ad
 SHA512 (pciutils-2.2.9.tar.bz2) = 2b3d98d027e46d8c08037366dde6f0781ca03c610ef2b380984639e4ef39899ed8d8b8e4cd9c9dc54df101279b95879bd66bfd4d04ad07fef41e847ea7ae32b5
-SHA512 (xen-4.14.4.tar.gz) = 7c8b86f204a30b82ffde2e1bd6da07d609f6721db2261eab01b8dc453a4cb66c9bde79212a44d066e947612aa0edd0051e1188abfaa3b646a76898720751dede
+SHA512 (xen-4.14.5.tar.gz) = 7fc1c98b5e135e14a1902786d6cf44304c1c1e9b600195592aa3d12ba937bc307eaae984596c30544519f181d2a02f2c9ad9c94d6b2b6fac2091b54568b0705e
diff --git a/xen.git-1a52e3946d9b04eb8a38d561524e42556cdeb4fb.patch b/xen.git-1a52e3946d9b04eb8a38d561524e42556cdeb4fb.patch
deleted file mode 100644
index ad59a14..0000000
--- a/xen.git-1a52e3946d9b04eb8a38d561524e42556cdeb4fb.patch
+++ /dev/null
@@ -1,94 +0,0 @@
-From: Andrew Cooper
-Date: Tue, 25 Jan 2022 17:14:48 +0000 (+0000)
-Subject: x86/spec-ctrl: Introduce new has_spec_ctrl boolean
-X-Git-Url: http://xenbits.xenproject.org/gitweb/?p=xen.git;a=commitdiff_plain;h=1a52e3946d9b04eb8a38d561524e42556cdeb4fb
-
-x86/spec-ctrl: Introduce new has_spec_ctrl boolean
-
-Most MSR_SPEC_CTRL setup will be common between Intel and AMD. Instead of
-opencoding an OR of two features everywhere, introduce has_spec_ctrl instead.
-
-Reword the comment above the Intel specific alternatives block to highlight
-that it is Intel specific, and pull the setting of default_xen_spec_ctrl.IBRS
-out because it will want to be common.
-
-No functional change.
-
-Signed-off-by: Andrew Cooper
-Reviewed-by: Jan Beulich
-(cherry picked from commit 5d9eff3a312763d889cfbf3c8468b6dfb3ab490c)
----
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index e85b0c0c7d..84d5de8856 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -898,7 +898,7 @@ static __init void mds_calculations(uint64_t caps)
- void __init init_speculation_mitigations(void)
- {
-     enum ind_thunk thunk = THUNK_DEFAULT;
--    bool ibrs = false, hw_smt_enabled;
-+    bool has_spec_ctrl, ibrs = false, hw_smt_enabled;
-     bool cpu_has_bug_taa;
-     uint64_t caps = 0;
-
-@@ -907,6 +907,8 @@ void __init init_speculation_mitigations(void)
-
-     hw_smt_enabled = check_smt_enabled();
-
-+    has_spec_ctrl = boot_cpu_has(X86_FEATURE_IBRSB);
-+
-     /*
-      * First, disable the use of retpolines if Xen is using shadow stacks, as
-      * they are incompatible.
-@@ -944,11 +946,11 @@ void __init init_speculation_mitigations(void)
-          */
-         else if ( retpoline_safe(caps) )
-             thunk = THUNK_RETPOLINE;
--        else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
-+        else if ( has_spec_ctrl )
-             ibrs = true;
-     }
-     /* Without compiler thunk support, use IBRS if available. */
--    else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
-+    else if ( has_spec_ctrl )
-         ibrs = true;
- }
-
-@@ -979,10 +981,7 @@ void __init init_speculation_mitigations(void)
-     else if ( thunk == THUNK_JMP )
-         setup_force_cpu_cap(X86_FEATURE_IND_THUNK_JMP);
-
--    /*
--     * If we are on hardware supporting MSR_SPEC_CTRL, see about setting up
--     * the alternatives blocks so we can virtualise support for guests.
--     */
-+    /* Intel hardware: MSR_SPEC_CTRL alternatives setup. */
-     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
-     {
-         if ( opt_msr_sc_pv )
-@@ -1001,11 +1000,12 @@ void __init init_speculation_mitigations(void)
-             default_spec_ctrl_flags |= SCF_ist_wrmsr;
-             setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
-         }
--
--        if ( ibrs )
--            default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
-     }
-
-+    /* If we have IBRS available, see whether we should use it. */
-+    if ( has_spec_ctrl && ibrs )
-+        default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
-+
-     /* If we have SSBD available, see whether we should use it. */
-     if ( boot_cpu_has(X86_FEATURE_SSBD) && opt_ssbd )
-         default_xen_spec_ctrl |= SPEC_CTRL_SSBD;
-@@ -1220,7 +1220,7 @@ void __init init_speculation_mitigations(void)
-      * boot won't have any other code running in a position to mount an
-      * attack.
-      */
--    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
-+    if ( has_spec_ctrl )
-     {
-         bsp_delay_spec_ctrl = !cpu_has_hypervisor && default_xen_spec_ctrl;
-
diff --git a/xen.git-35d0ea6726f8f013cbf3699a90309136896ae55e.patch b/xen.git-35d0ea6726f8f013cbf3699a90309136896ae55e.patch
deleted file mode 100644
index b462ae4..0000000
--- a/xen.git-35d0ea6726f8f013cbf3699a90309136896ae55e.patch
+++ /dev/null
@@ -1,62 +0,0 @@
-From: Andrew Cooper
-Date: Tue, 25 Jan 2022 16:09:59 +0000 (+0000)
-Subject: x86/spec-ctrl: Drop use_spec_ctrl boolean
-X-Git-Url: http://xenbits.xenproject.org/gitweb/?p=xen.git;a=commitdiff_plain;h=35d0ea6726f8f013cbf3699a90309136896ae55e
-
-x86/spec-ctrl: Drop use_spec_ctrl boolean
-
-Several bugfixes have reduced the utility of this variable from it's original
-purpose, and now all it does is aid in the setup of SCF_ist_wrmsr.
-
-Simplify the logic by drop the variable, and doubling up the setting of
-SCF_ist_wrmsr for the PV and HVM blocks, which will make the AMD SPEC_CTRL
-support easier to follow. Leave a comment explaining why SCF_ist_wrmsr is
-still necessary for the VMExit case.
-
-No functional change.
-
-Signed-off-by: Andrew Cooper
-Reviewed-by: Jan Beulich
-(cherry picked from commit ec083bf552c35e10347449e21809f4780f8155d2)
----
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index f70535b6e7..e85b0c0c7d 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -898,7 +898,7 @@ static __init void mds_calculations(uint64_t caps)
- void __init init_speculation_mitigations(void)
- {
-     enum ind_thunk thunk = THUNK_DEFAULT;
--    bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
-+    bool ibrs = false, hw_smt_enabled;
-     bool cpu_has_bug_taa;
-     uint64_t caps = 0;
-
-@@ -987,19 +987,21 @@ void __init init_speculation_mitigations(void)
-     {
-         if ( opt_msr_sc_pv )
-         {
--            use_spec_ctrl = true;
-+            default_spec_ctrl_flags |= SCF_ist_wrmsr;
-             setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV);
-         }
-
-         if ( opt_msr_sc_hvm )
-         {
--            use_spec_ctrl = true;
-+            /*
-+             * While the guest MSR_SPEC_CTRL value is loaded/saved atomically,
-+             * Xen's value is not restored atomically. An early NMI hitting
-+             * the VMExit path needs to restore Xen's value for safety.
-+             */
-+            default_spec_ctrl_flags |= SCF_ist_wrmsr;
-             setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
-         }
-
--        if ( use_spec_ctrl )
--            default_spec_ctrl_flags |= SCF_ist_wrmsr;
--
-         if ( ibrs )
-             default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
-     }
diff --git a/xen.spec b/xen.spec
index 3ea73bf..9ec19cd 100644
--- a/xen.spec
+++ b/xen.spec
@@ -57,8 +57,8 @@
 Summary: Xen is a virtual machine monitor
 Name:    xen
-Version: 4.14.4
-Release: 3%{?dist}
+Version: 4.14.5
+Release: 1%{?dist}
 License: GPLv2+ and LGPLv2+ and BSD
 URL:     http://xen.org/
 Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz
@@ -118,27 +118,6 @@ Patch47: xen.git-d6627cf1b63ce57a6a7e2c1800dbc50eed742c32.patch
 Patch48: xen.git-d8099d94dfaa3573bd86ebfc457cbc8f70a3ecda.patch
 Patch49: xen.git-8169f82049efb5b2044b33aa482ba3a136b7804d.patch
 Patch56: xsa376.patch
-Patch57: xsa398-4.14-1-xen-arm-Introduce-new-Arm-processors.patch
-Patch58: xsa398-4.14-2-xen-arm-move-errata-CSV2-check-earlier.patch
-Patch59: xsa398-4.14-3-xen-arm-Add-ECBHB-and-CLEARBHB-ID-fields.patch
-Patch60: xsa398-4.14-4-xen-arm-Add-Spectre-BHB-handling.patch
-Patch61: xsa398-4.14-5-xen-arm-Allow-to-discover-and-use-SMCCC_ARCH_WORKARO.patch
-Patch62: xen.git-35d0ea6726f8f013cbf3699a90309136896ae55e.patch
-Patch63: xen.git-1a52e3946d9b04eb8a38d561524e42556cdeb4fb.patch
-Patch64: xsa398-4.14-6-x86-spec-ctrl-Cease-using-thunk-lfence-on-AMD.patch
-Patch65: xsa397-4.14.patch
-Patch66: xsa399-4.16.patch
-Patch67: xsa400-4.14-01.patch
-Patch68: xsa400-4.14-02.patch
-Patch69: xsa400-4.14-03.patch
-Patch70: xsa400-4.14-04.patch
-Patch71: xsa400-4.14-05.patch
-Patch72: xsa400-4.14-06.patch
-Patch73: xsa400-4.14-07.patch
-Patch74: xsa400-4.14-08.patch
-Patch75: xsa400-4.14-09.patch
-Patch76: xsa400-4.14-10.patch
-Patch77: xsa400-4.14-11.patch
 
 
 %if %build_qemutrad
@@ -353,27 +332,6 @@ manage Xen virtual machines.
 %patch48 -p1
 %patch49 -p1
 %patch56 -p1
-%patch57 -p1
-%patch58 -p1
-%patch59 -p1
-%patch60 -p1
-%patch61 -p1
-%patch62 -p1
-%patch63 -p1
-%patch64 -p1
-%patch65 -p1
-%patch66 -p1
-%patch67 -p1
-%patch68 -p1
-%patch69 -p1
-%patch70 -p1
-%patch71 -p1
-%patch72 -p1
-%patch73 -p1
-%patch74 -p1
-%patch75 -p1
-%patch76 -p1
-%patch77 -p1
 
 # qemu-xen-traditional patches
 pushd tools/qemu-xen-traditional
@@ -980,6 +938,10 @@ fi
 %endif
 
 %changelog
+* Fri Apr 15 2022 Michael Young - 4.14.5-1
+- update to xen-4.14.5
+  remove or adjust patches now included or superseded upstream
+
 * Wed Apr 06 2022 Michael Young - 4.14.4-3
 - Racy interactions between dirty vram tracking and paging log dirty
   hypercalls [XSA-397, CVE-2022-26356]
diff --git a/xsa397-4.14.patch b/xsa397-4.14.patch
deleted file mode 100644
index 0ac6230..0000000
--- a/xsa397-4.14.patch
+++ /dev/null
@@ -1,98 +0,0 @@
-From: Roger Pau Monne
-Subject: x86/hap: do not switch on log dirty for VRAM tracking
-
-XEN_DMOP_track_dirty_vram possibly calls into paging_log_dirty_enable
-when using HAP mode, and it can interact badly with other ongoing
-paging domctls, as XEN_DMOP_track_dirty_vram is not holding the domctl
-lock.
-
-This was detected as a result of the following assert triggering when
-doing repeated migrations of a HAP HVM domain with a stubdom:
-
-Assertion 'd->arch.paging.log_dirty.allocs == 0' failed at paging.c:198
-----[ Xen-4.17-unstable x86_64 debug=y Not tainted ]----
-CPU: 34
-RIP: e008:[] arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x6
-RFLAGS: 0000000000010206 CONTEXT: hypervisor (d0v23)
-[...]
-Xen call trace:
-   [] R arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x63a
-   [] S xsm/flask/hooks.c#domain_has_perm+0x5a/0x67
-   [] F paging_domctl+0x251/0xd41
-   [] F paging_domctl_continuation+0x19d/0x202
-   [] F pv_hypercall+0x150/0x2a7
-   [] F lstar_enter+0x12d/0x140
-
-Such assert triggered because the stubdom used
-XEN_DMOP_track_dirty_vram while dom0 was in the middle of executing
-XEN_DOMCTL_SHADOW_OP_OFF, and so log dirty become enabled while
-retiring the old structures, thus leading to new entries being
-populated in already clear slots.
-
-Fix this by not enabling log dirty for VRAM tracking, similar to what
-is done when using shadow instead of HAP. Call
-p2m_enable_hardware_log_dirty when enabling VRAM tracking in order to
-get some hardware assistance if available. As a side effect the memory
-pressure on the p2m pool should go down if only VRAM tracking is
-enabled, as the dirty bitmap is no longer allocated.
-
-Note that paging_log_dirty_range (used to get the dirty bitmap for
-VRAM tracking) doesn't use the log dirty bitmap, and instead relies on
-checking whether each gfn on the range has been switched from
-p2m_ram_logdirty to p2m_ram_rw in order to account for dirty pages.
-
-This is CVE-2022-26356 / XSA-397.
-
-Signed-off-by: Roger Pau Monné
-Reviewed-by: Jan Beulich
-
----- a/xen/include/asm-x86/paging.h
-+++ b/xen/include/asm-x86/paging.h
-@@ -160,9 +160,6 @@ void paging_log_dirty_range(struct domai
-                             unsigned long nr,
-                             uint8_t *dirty_bitmap);
-
--/* enable log dirty */
--int paging_log_dirty_enable(struct domain *d, bool log_global);
--
- /* log dirty initialization */
- void paging_log_dirty_init(struct domain *d, const struct log_dirty_ops *ops);
-
----- a/xen/arch/x86/mm/hap/hap.c
-+++ b/xen/arch/x86/mm/hap/hap.c
-@@ -69,13 +69,6 @@ int hap_track_dirty_vram(struct domain *
-     {
-         int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
-
--        if ( !paging_mode_log_dirty(d) )
--        {
--            rc = paging_log_dirty_enable(d, false);
--            if ( rc )
--                goto out;
--        }
--
-         rc = -ENOMEM;
-         dirty_bitmap = vzalloc(size);
-         if ( !dirty_bitmap )
-@@ -107,6 +100,10 @@ int hap_track_dirty_vram(struct domain *
-
-         paging_unlock(d);
-
-+        domain_pause(d);
-+        p2m_enable_hardware_log_dirty(d);
-+        domain_unpause(d);
-+
-         if ( oend > ostart )
-             p2m_change_type_range(d, ostart, oend,
-                                   p2m_ram_logdirty, p2m_ram_rw);
----- a/xen/arch/x86/mm/paging.c
-+++ b/xen/arch/x86/mm/paging.c
-@@ -209,7 +209,7 @@ static int paging_free_log_dirty_bitmap(
-     return rc;
- }
-
--int paging_log_dirty_enable(struct domain *d, bool log_global)
-+static int paging_log_dirty_enable(struct domain *d, bool log_global)
- {
-     int ret;
-
diff --git a/xsa398-4.14-1-xen-arm-Introduce-new-Arm-processors.patch b/xsa398-4.14-1-xen-arm-Introduce-new-Arm-processors.patch
deleted file mode 100644
index e6b2569..0000000
--- a/xsa398-4.14-1-xen-arm-Introduce-new-Arm-processors.patch
+++ /dev/null
@@ -1,63 +0,0 @@
-From 021466aa73caaa0c5983f02203678e649dd4d22c Mon Sep 17 00:00:00 2001
-From: Bertrand Marquis
-Date: Tue, 15 Feb 2022 10:37:51 +0000
-Subject: xen/arm: Introduce new Arm processors
-
-Add some new processor identifiers in processor.h and sync Xen
-definitions with status of Linux 5.17 (declared in
-arch/arm64/include/asm/cputype.h).
-
-This is part of XSA-398 / CVE-2022-23960.
-
-Signed-off-by: Bertrand Marquis
-Acked-by: Julien Grall
-(cherry picked from commit 35d1b85a6b43483f6bd007d48757434e54743e98)
-
-diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
-index 87c8136022df..17cc5cf486f9 100644
---- a/xen/include/asm-arm/processor.h
-+++ b/xen/include/asm-arm/processor.h
-@@ -53,6 +53,7 @@
- #define ARM_CPU_PART_CORTEX_A17     0xC0E
- #define ARM_CPU_PART_CORTEX_A15     0xC0F
- #define ARM_CPU_PART_CORTEX_A53     0xD03
-+#define ARM_CPU_PART_CORTEX_A35     0xD04
- #define ARM_CPU_PART_CORTEX_A55     0xD05
- #define ARM_CPU_PART_CORTEX_A57     0xD07
- #define ARM_CPU_PART_CORTEX_A72     0xD08
-@@ -60,11 +61,20 @@
- #define ARM_CPU_PART_CORTEX_A75     0xD0A
- #define ARM_CPU_PART_CORTEX_A76     0xD0B
- #define ARM_CPU_PART_NEOVERSE_N1    0xD0C
-+#define ARM_CPU_PART_CORTEX_A77     0xD0D
-+#define ARM_CPU_PART_NEOVERSE_V1    0xD40
-+#define ARM_CPU_PART_CORTEX_A78     0xD41
-+#define ARM_CPU_PART_CORTEX_X1      0xD44
-+#define ARM_CPU_PART_CORTEX_A710    0xD47
-+#define ARM_CPU_PART_CORTEX_X2      0xD48
-+#define ARM_CPU_PART_NEOVERSE_N2    0xD49
-+#define ARM_CPU_PART_CORTEX_A78C    0xD4B
-
- #define MIDR_CORTEX_A12 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A12)
- #define MIDR_CORTEX_A17 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A17)
- #define MIDR_CORTEX_A15 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A15)
- #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
-+#define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35)
- #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55)
- #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
- #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
-@@ -72,6 +82,14 @@
- #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
- #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
- #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
-+#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
-+#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1)
-+#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
-+#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
-+#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
-+#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2)
-+#define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2)
-+#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C)
-
- /* MPIDR Multiprocessor Affinity Register */
- #define _MPIDR_UP           (30)
diff --git a/xsa398-4.14-2-xen-arm-move-errata-CSV2-check-earlier.patch b/xsa398-4.14-2-xen-arm-move-errata-CSV2-check-earlier.patch
deleted file mode 100644
index 556b9c5..0000000
--- a/xsa398-4.14-2-xen-arm-move-errata-CSV2-check-earlier.patch
+++ /dev/null
@@ -1,53 +0,0 @@
-From 6da7a845fb476ef7395185ec08a58c76ebd8c442 Mon Sep 17 00:00:00 2001
-From: Bertrand Marquis
-Date: Tue, 15 Feb 2022 10:39:47 +0000
-Subject: xen/arm: move errata CSV2 check earlier
-
-CSV2 availability check is done after printing to the user that
-workaround 1 will be used. Move the check before to prevent saying to the
-user that workaround 1 is used when it is not because it is not needed.
-This will also allow to reuse install_bp_hardening_vec function for
-other use cases.
-
-Code previously returning "true", now returns "0" to conform to
-enable_smccc_arch_workaround_1 returning an int and surrounding code
-doing a "return 0" if workaround is not needed.
-
-This is part of XSA-398 / CVE-2022-23960.
-
-Signed-off-by: Bertrand Marquis
-Reviewed-by: Julien Grall
-(cherry picked from commit 599616d70eb886b9ad0ef9d6b51693ce790504ba)
-
-diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
-index 66d9a1e45cf8..9d79e3bad7e8 100644
---- a/xen/arch/arm/cpuerrata.c
-+++ b/xen/arch/arm/cpuerrata.c
-@@ -103,13 +103,6 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry,
-     printk(XENLOG_INFO "CPU%u will %s on exception entry\n",
-            smp_processor_id(), desc);
-
--    /*
--     * No need to install hardened vector when the processor has
--     * ID_AA64PRF0_EL1.CSV2 set.
--     */
--    if ( cpu_data[smp_processor_id()].pfr64.csv2 )
--        return true;
--
-     spin_lock(&bp_lock);
-
-     /*
-@@ -168,6 +161,13 @@ static int enable_smccc_arch_workaround_1(void *data)
-     if ( !entry->matches(entry) )
-         return 0;
-
-+    /*
-+     * No need to install hardened vector when the processor has
-+     * ID_AA64PRF0_EL1.CSV2 set.
-+     */
-+    if ( cpu_data[smp_processor_id()].pfr64.csv2 )
-+        return 0;
-+
-     if ( smccc_ver < SMCCC_VERSION(1, 1) )
-         goto warn;
-
diff --git a/xsa398-4.14-3-xen-arm-Add-ECBHB-and-CLEARBHB-ID-fields.patch b/xsa398-4.14-3-xen-arm-Add-ECBHB-and-CLEARBHB-ID-fields.patch
deleted file mode 100644
index 8b9bc8a..0000000
--- a/xsa398-4.14-3-xen-arm-Add-ECBHB-and-CLEARBHB-ID-fields.patch
+++ /dev/null
@@ -1,76 +0,0 @@
-From ee4b53ae1b95966fd9a491668f0eca73028925e1 Mon Sep 17 00:00:00 2001
-From: Bertrand Marquis
-Date: Wed, 23 Feb 2022 09:42:18 +0000
-Subject: xen/arm: Add ECBHB and CLEARBHB ID fields
-
-Introduce ID coprocessor register ID_AA64ISAR2_EL1.
-Add definitions in cpufeature and sysregs of ECBHB field in mmfr1 and
-CLEARBHB in isar2 ID coprocessor registers.
-
-This is part of XSA-398 / CVE-2022-23960.
-
-Signed-off-by: Bertrand Marquis
-Acked-by: Julien Grall
-(cherry picked from commit 4b68d12d98b8790d8002fcc2c25a9d713374a4d7)
-
-diff --git a/xen/arch/arm/cpufeature.c b/xen/arch/arm/cpufeature.c
-index 44126dbf0723..13dac7ccaf94 100644
---- a/xen/arch/arm/cpufeature.c
-+++ b/xen/arch/arm/cpufeature.c
-@@ -117,6 +117,7 @@ void identify_cpu(struct cpuinfo_arm *c)
-
-     c->isa64.bits[0] = READ_SYSREG64(ID_AA64ISAR0_EL1);
-     c->isa64.bits[1] = READ_SYSREG64(ID_AA64ISAR1_EL1);
-+    c->isa64.bits[2] = READ_SYSREG64(ID_AA64ISAR2_EL1);
- #endif
-
-     c->pfr32.bits[0] = READ_SYSREG32(ID_PFR0_EL1);
-diff --git a/xen/include/asm-arm/arm64/sysregs.h b/xen/include/asm-arm/arm64/sysregs.h
-index c60029d38f5b..cfd2e1d48699 100644
---- a/xen/include/asm-arm/arm64/sysregs.h
-+++ b/xen/include/asm-arm/arm64/sysregs.h
-@@ -57,6 +57,10 @@
- #define ICH_AP1R2_EL2             __AP1Rx_EL2(2)
- #define ICH_AP1R3_EL2             __AP1Rx_EL2(3)
-
-+#ifndef ID_AA64ISAR2_EL1
-+#define ID_AA64ISAR2_EL1 S3_0_C0_C6_2
-+#endif
-+
- /* Access to system registers */
-
- #define READ_SYSREG32(name) ((uint32_t)READ_SYSREG64(name))
-diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
-index 016a9fe2039a..7be4ee8cf821 100644
---- a/xen/include/asm-arm/cpufeature.h
-+++ b/xen/include/asm-arm/cpufeature.h
-@@ -188,12 +188,26 @@ struct cpuinfo_arm {
-             unsigned long lo:4;
-             unsigned long pan:4;
-             unsigned long __res1:8;
--            unsigned long __res2:32;
-+            unsigned long __res2:28;
-+            unsigned long ecbhb:4;
-         };
-     } mm64;
-
--    struct {
--        uint64_t bits[2];
-+    union {
-+        uint64_t bits[3];
-+        struct {
-+            /* ISAR0 */
-+            unsigned long __res0:64;
-+
-+            /* ISAR1 */
-+            unsigned long __res1:64;
-+
-+            /* ISAR2 */
-+            unsigned long __res3:28;
-+            unsigned long clearbhb:4;
-+
-+            unsigned long __res4:32;
-+        };
-     } isa64;
-
- #endif
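A minimal sketch of how a 4-bit ID-register field such as the CLEARBHB one added above ends up being consulted (standalone C, not Xen code; the helper name is invented for illustration). Per the bitfield layout in the hunk, CLEARBHB sits in ID_AA64ISAR2_EL1 bits [31:28]:

    #include <stdbool.h>
    #include <stdint.h>

    /* CLEARBHB occupies bits [31:28] of ID_AA64ISAR2_EL1; a non-zero
     * value means the CLEARBHB instruction (HINT #22) is implemented,
     * matching the clearbhb:4 field added to struct cpuinfo_arm above. */
    static bool cpu_has_clearbhb(uint64_t isar2)
    {
        return ((isar2 >> 28) & 0xf) != 0;
    }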
diff --git a/xsa398-4.14-4-xen-arm-Add-Spectre-BHB-handling.patch b/xsa398-4.14-4-xen-arm-Add-Spectre-BHB-handling.patch
deleted file mode 100644
index 18e01c6..0000000
--- a/xsa398-4.14-4-xen-arm-Add-Spectre-BHB-handling.patch
+++ /dev/null
@@ -1,351 +0,0 @@
-From fc56dd212e4574c5fd77f830d077036b330dc1b5 Mon Sep 17 00:00:00 2001
-From: Rahul Singh
-Date: Mon, 14 Feb 2022 18:47:32 +0000
-Subject: xen/arm: Add Spectre BHB handling
-
-This commit is adding Spectre BHB handling to Xen on Arm.
-The commit is introducing new alternative code to be executed during
-exception entry:
-- SMCC workaround 3 call
-- loop workaround (with 8, 24 or 32 iterations)
-- use of new clearbhb instruction
-
-Cpuerrata is modified by this patch to apply the required workaround for
-CPU affected by Spectre BHB when CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR is
-enabled.
-
-To do this the system previously used to apply smcc workaround 1 is
-reused and new alternative code to be copied in the exception handler is
-introduced.
-
-To define the type of workaround required by a processor, 4 new cpu
-capabilities are introduced (for each number of loop and for smcc
-workaround 3).
-
-When a processor is affected, enable_spectre_bhb_workaround is called
-and if the processor does not have CSV2 set to 3 or ECBHB feature (which
-would mean that the processor is doing what is required in hardware),
-the proper code is enabled at exception entry.
-
-In the case where workaround 3 is not supported by the firmware, we
-enable workaround 1 when possible as it will also mitigate Spectre BHB
-on systems without CSV2.
-
-This is part of XSA-398 / CVE-2022-23960.
-
-Signed-off-by: Bertrand Marquis
-Signed-off-by: Rahul Singh
-Acked-by: Julien Grall
-(cherry picked from commit 62c91eb66a2904eefb1d1d9642e3697a1e3c3a3c)
-
-diff --git a/xen/arch/arm/arm64/bpi.S b/xen/arch/arm/arm64/bpi.S
-index d8743d955c4a..4e6382522048 100644
---- a/xen/arch/arm/arm64/bpi.S
-+++ b/xen/arch/arm/arm64/bpi.S
-@@ -58,16 +58,42 @@ ENTRY(__bp_harden_hyp_vecs_start)
-     .endr
- ENTRY(__bp_harden_hyp_vecs_end)
-
--ENTRY(__smccc_workaround_1_smc_start)
-+.macro mitigate_spectre_bhb_loop count
-+ENTRY(__mitigate_spectre_bhb_loop_start_\count)
-+    stp x0, x1, [sp, #-16]!
-+    mov x0, \count
-+.Lspectre_bhb_loop\@:
-+    b . + 4
-+    subs x0, x0, #1
-+    b.ne .Lspectre_bhb_loop\@
-+    sb
-+    ldp x0, x1, [sp], #16
-+ENTRY(__mitigate_spectre_bhb_loop_end_\count)
-+.endm
-+
-+.macro smccc_workaround num smcc_id
-+ENTRY(__smccc_workaround_smc_start_\num)
-     sub sp, sp, #(8 * 4)
-     stp x0, x1, [sp, #(8 * 2)]
-     stp x2, x3, [sp, #(8 * 0)]
--    mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
-+    mov w0, \smcc_id
-     smc #0
-     ldp x2, x3, [sp, #(8 * 0)]
-     ldp x0, x1, [sp, #(8 * 2)]
-     add sp, sp, #(8 * 4)
--ENTRY(__smccc_workaround_1_smc_end)
-+ENTRY(__smccc_workaround_smc_end_\num)
-+.endm
-+
-+ENTRY(__mitigate_spectre_bhb_clear_insn_start)
-+    clearbhb
-+    isb
-+ENTRY(__mitigate_spectre_bhb_clear_insn_end)
-+
-+mitigate_spectre_bhb_loop 8
-+mitigate_spectre_bhb_loop 24
-+mitigate_spectre_bhb_loop 32
-+smccc_workaround 1, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
-+smccc_workaround 3, #ARM_SMCCC_ARCH_WORKAROUND_3_FID
-
- /*
-  * Local variables:
-diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
-index 9d79e3bad7e8..1c1149b2c795 100644
---- a/xen/arch/arm/cpuerrata.c
-+++ b/xen/arch/arm/cpuerrata.c
-@@ -145,7 +145,16 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry,
-     return ret;
- }
-
--extern char __smccc_workaround_1_smc_start[], __smccc_workaround_1_smc_end[];
-+extern char __smccc_workaround_smc_start_1[], __smccc_workaround_smc_end_1[];
-+extern char __smccc_workaround_smc_start_3[], __smccc_workaround_smc_end_3[];
-+extern char __mitigate_spectre_bhb_clear_insn_start[],
-+            __mitigate_spectre_bhb_clear_insn_end[];
-+extern char __mitigate_spectre_bhb_loop_start_8[],
-+            __mitigate_spectre_bhb_loop_end_8[];
-+extern char __mitigate_spectre_bhb_loop_start_24[],
-+            __mitigate_spectre_bhb_loop_end_24[];
-+extern char __mitigate_spectre_bhb_loop_start_32[],
-+            __mitigate_spectre_bhb_loop_end_32[];
-
- static int enable_smccc_arch_workaround_1(void *data)
- {
-@@ -177,8 +186,8 @@ static int enable_smccc_arch_workaround_1(void *data)
-     if ( (int)res.a0 < 0 )
-         goto warn;
-
--    return !install_bp_hardening_vec(entry,__smccc_workaround_1_smc_start,
--                                     __smccc_workaround_1_smc_end,
-+    return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_1,
-+                                     __smccc_workaround_smc_end_1,
-                                      "call ARM_SMCCC_ARCH_WORKAROUND_1");
-
- warn:
-@@ -193,6 +202,93 @@ static int enable_smccc_arch_workaround_1(void *data)
-     return 0;
- }
-
-+/*
-+ * Spectre BHB Mitigation
-+ *
-+ * CPU is either:
-+ * - Having CVS2.3 so it is not affected.
-+ * - Having ECBHB and is clearing the branch history buffer when an exception
-+ *   to a different exception level is happening so no mitigation is needed.
-+ * - Mitigating using a loop on exception entry (number of loop depending on
-+ *   the CPU).
-+ * - Mitigating using the firmware.
-+ */
-+static int enable_spectre_bhb_workaround(void *data)
-+{
-+    const struct arm_cpu_capabilities *entry = data;
-+
-+    /*
-+     * Enable callbacks are called on every CPU based on the capabilities, so
-+     * double-check whether the CPU matches the entry.
-+     */
-+    if ( !entry->matches(entry) )
-+        return 0;
-+
-+    if ( cpu_data[smp_processor_id()].pfr64.csv2 == 3 )
-+        return 0;
-+
-+    if ( cpu_data[smp_processor_id()].mm64.ecbhb )
-+        return 0;
-+
-+    if ( cpu_data[smp_processor_id()].isa64.clearbhb )
-+        return !install_bp_hardening_vec(entry,
-+                                         __mitigate_spectre_bhb_clear_insn_start,
-+                                         __mitigate_spectre_bhb_clear_insn_end,
-+                                         "use clearBHB instruction");
-+
-+    /* Apply solution depending on hwcaps set on arm_errata */
-+    if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_8) )
-+        return !install_bp_hardening_vec(entry,
-+                                         __mitigate_spectre_bhb_loop_start_8,
-+                                         __mitigate_spectre_bhb_loop_end_8,
-+                                         "use 8 loops workaround");
-+
-+    if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_24) )
-+        return !install_bp_hardening_vec(entry,
-+                                         __mitigate_spectre_bhb_loop_start_24,
-+                                         __mitigate_spectre_bhb_loop_end_24,
-+                                         "use 24 loops workaround");
-+
-+    if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_32) )
-+        return !install_bp_hardening_vec(entry,
-+                                         __mitigate_spectre_bhb_loop_start_32,
-+                                         __mitigate_spectre_bhb_loop_end_32,
-+                                         "use 32 loops workaround");
-+
-+    if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
-+    {
-+        struct arm_smccc_res res;
-+
-+        if ( smccc_ver < SMCCC_VERSION(1, 1) )
-+            goto warn;
-+
-+        arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FID,
-+                          ARM_SMCCC_ARCH_WORKAROUND_3_FID, &res);
-+        /* The return value is in the lower 32-bits. */
-+        if ( (int)res.a0 < 0 )
-+        {
-+            /*
-+             * On processor affected with CSV2=0, workaround 1 will mitigate
-+             * both Spectre v2 and BHB so use it when available
-+             */
-+            if ( enable_smccc_arch_workaround_1(data) )
-+                return 1;
-+
-+            goto warn;
-+        }
-+
-+        return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_3,
-+                                         __smccc_workaround_smc_end_3,
-+                                         "call ARM_SMCCC_ARCH_WORKAROUND_3");
-+    }
-+
-+warn:
-+    printk_once("**** No support for any spectre BHB workaround.  ****\n"
-+                "**** Please update your firmware.                ****\n");
-+
-+    return 0;
-+}
-+
- #endif /* CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR */
-
- /* Hardening Branch predictor code for Arm32 */
-@@ -438,19 +534,77 @@ static const struct arm_cpu_capabilities arm_errata[] = {
-     },
-     {
-         .capability = ARM_HARDEN_BRANCH_PREDICTOR,
--        MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
-+        MIDR_RANGE(MIDR_CORTEX_A72, 0, 1 << MIDR_VARIANT_SHIFT),
-         .enable = enable_smccc_arch_workaround_1,
-     },
-     {
--        .capability = ARM_HARDEN_BRANCH_PREDICTOR,
-+        .capability = ARM_WORKAROUND_BHB_SMCC_3,
-         MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
--        .enable = enable_smccc_arch_workaround_1,
-+        .enable = enable_spectre_bhb_workaround,
-     },
-     {
--        .capability = ARM_HARDEN_BRANCH_PREDICTOR,
-+        .capability = ARM_WORKAROUND_BHB_SMCC_3,
-         MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
--        .enable = enable_smccc_arch_workaround_1,
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    /* spectre BHB */
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_8,
-+        MIDR_RANGE(MIDR_CORTEX_A72, 1 << MIDR_VARIANT_SHIFT,
-+                   (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_24,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_24,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
-+        .enable = enable_spectre_bhb_workaround,
-     },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_24,
-+        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+    {
-+        .capability = ARM_WORKAROUND_BHB_LOOP_32,
-+        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
-+        .enable = enable_spectre_bhb_workaround,
-+    },
-+
- #endif
- #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR
-     {
-diff --git a/xen/include/asm-arm/arm64/macros.h b/xen/include/asm-arm/arm64/macros.h
-index f981b4f43e84..5100aed6e3ec 100644
---- a/xen/include/asm-arm/arm64/macros.h
-+++ b/xen/include/asm-arm/arm64/macros.h
-@@ -21,6 +21,11 @@
-     ldr \dst, [\dst, \tmp]
-     .endm
-
-+    /* clearbhb instruction clearing the branch history */
-+    .macro clearbhb
-+        hint    #22
-+    .endm
-+
- /*
-  * Register aliases.
-  */
-diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
-index 7be4ee8cf821..14c7f7d218e2 100644
---- a/xen/include/asm-arm/cpufeature.h
-+++ b/xen/include/asm-arm/cpufeature.h
-@@ -46,8 +46,12 @@
- #define ARM_SMCCC_1_1 8
- #define ARM64_WORKAROUND_AT_SPECULATE 9
- #define ARM_WORKAROUND_858921 10
-+#define ARM_WORKAROUND_BHB_LOOP_8 11
-+#define ARM_WORKAROUND_BHB_LOOP_24 12
-+#define ARM_WORKAROUND_BHB_LOOP_32 13
-+#define ARM_WORKAROUND_BHB_SMCC_3 14
-
--#define ARM_NCAPS 11
-+#define ARM_NCAPS 15
-
- #ifndef __ASSEMBLY__
-
-diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
-index 9d94beb3df2d..b3dbeecc90ad 100644
---- a/xen/include/asm-arm/smccc.h
-+++ b/xen/include/asm-arm/smccc.h
-@@ -334,6 +334,12 @@ void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
-                        ARM_SMCCC_OWNER_ARCH,        \
-                        0x7FFF)
-
-+#define ARM_SMCCC_ARCH_WORKAROUND_3_FID             \
-+    ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,         \
-+                       ARM_SMCCC_CONV_32,           \
-+                       ARM_SMCCC_OWNER_ARCH,        \
-+                       0x3FFF)
-+
- /* SMCCC error codes */
- #define ARM_SMCCC_NOT_REQUIRED          (-2)
- #define ARM_SMCCC_ERR_UNKNOWN_FUNCTION  (-1)
diff --git a/xsa398-4.14-5-xen-arm-Allow-to-discover-and-use-SMCCC_ARCH_WORKARO.patch b/xsa398-4.14-5-xen-arm-Allow-to-discover-and-use-SMCCC_ARCH_WORKARO.patch
deleted file mode 100644
index dc4db67..0000000
--- a/xsa398-4.14-5-xen-arm-Allow-to-discover-and-use-SMCCC_ARCH_WORKARO.patch
+++ /dev/null
@@ -1,91 +0,0 @@
-From 7cebd77c80ce87f84c63a6043a5ad7115ccab9d5 Mon Sep 17 00:00:00 2001
-From: Bertrand Marquis
-Date: Thu, 17 Feb 2022 14:52:54 +0000
-Subject: xen/arm: Allow to discover and use SMCCC_ARCH_WORKAROUND_3
-
-Allow guest to discover whether or not SMCCC_ARCH_WORKAROUND_3 is
-supported and create a fastpath in the code to handle guests request to
-do the workaround.
-
-The function SMCCC_ARCH_WORKAROUND_3 will be called by the guest for
-flushing the branch history. So we want the handling to be as fast as
-possible.
-
-As the mitigation is applied on every guest exit, we can check for the
-call before saving all context and return very early.
-
-This is part of XSA-398 / CVE-2022-23960.
-
-Signed-off-by: Bertrand Marquis
-Reviewed-by: Julien Grall
-(cherry picked from commit c0a56ea0fd92ecb471936b7355ddbecbaea3707c)
-
-diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
-index 175ea2981e72..a8c214506786 100644
---- a/xen/arch/arm/arm64/entry.S
-+++ b/xen/arch/arm/arm64/entry.S
-@@ -338,16 +338,26 @@ guest_sync:
-         cbnz    x1, guest_sync_slowpath         /* should be 0 for HVC #0 */
-
-         /*
--         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
--         * The workaround has already been applied on the exception
-+         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1 and
-+         * ARM_SMCCC_ARCH_WORKAROUND_3.
-+         * The workaround needed has already been applied on the exception
-          * entry from the guest, so let's quickly get back to the guest.
-          *
-          * Note that eor is used because the function identifier cannot
-          * be encoded as an immediate for cmp.
-          */
-         eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
--        cbnz    w0, check_wa2
-+        cbz     w0, fastpath_out_workaround
-
-+        /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
-+        eor     w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID)
-+        cbz     w0, wa2_ssbd
-+
-+        /* Fastpath out for ARM_SMCCC_ARCH_WORKAROUND_3 */
-+        eor     w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_2_FID ^ ARM_SMCCC_ARCH_WORKAROUND_3_FID)
-+        cbnz    w0, guest_sync_slowpath
-+
-+fastpath_out_workaround:
-         /*
-          * Clobber both x0 and x1 to prevent leakage. Note that thanks
-          * the eor, x0 = 0.
-          */
-         mov     x1, xzr
-         eret
-         sb
-
--check_wa2:
--        /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
--        eor     w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID)
--        cbnz    w0, guest_sync_slowpath
-+wa2_ssbd:
- #ifdef CONFIG_ARM_SSBD
- alternative_cb arm_enable_wa2_handling
-         b       wa2_end
-diff --git a/xen/arch/arm/vsmc.c b/xen/arch/arm/vsmc.c
-index a36db15fffc0..b633ff2fe897 100644
---- a/xen/arch/arm/vsmc.c
-+++ b/xen/arch/arm/vsmc.c
-@@ -124,6 +124,10 @@ static bool handle_arch(struct cpu_user_regs *regs)
-                 break;
-             }
-             break;
-+        case ARM_SMCCC_ARCH_WORKAROUND_3_FID:
-+            if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
-+                ret = 0;
-+            break;
-         }
-
-         set_user_reg(regs, 0, ret);
-@@ -132,6 +136,7 @@ static bool handle_arch(struct cpu_user_regs *regs)
-     }
-
-     case ARM_SMCCC_ARCH_WORKAROUND_1_FID:
-+    case ARM_SMCCC_ARCH_WORKAROUND_3_FID:
-         /* No return value */
-         return true;
-
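The entry.S comment above notes that eor is used because the SMCCC function identifiers cannot be encoded as cmp immediates. The resulting XOR-chaining trick can be expressed in standalone C (sketch only; the three IDs follow the SMCCC spec values these patches use, but treat them as illustrative constants):

    #include <stdint.h>

    #define WA1_FID 0x80008000u   /* ARM_SMCCC_ARCH_WORKAROUND_1 */
    #define WA2_FID 0x80007fffu   /* ARM_SMCCC_ARCH_WORKAROUND_2 */
    #define WA3_FID 0x80003fffu   /* ARM_SMCCC_ARCH_WORKAROUND_3 */

    /* After each XOR, fid == 0 exactly when the original value matched the
     * constant tested so far; XORing with (A ^ B) moves the test on from
     * constant A to constant B without needing either as an immediate. */
    static int classify(uint32_t fid)
    {
        fid ^= WA1_FID;
        if ( fid == 0 )
            return 1;                /* workaround 1 fastpath */
        fid ^= WA1_FID ^ WA2_FID;
        if ( fid == 0 )
            return 2;                /* workaround 2 (SSBD) */
        fid ^= WA2_FID ^ WA3_FID;
        return fid == 0 ? 3 : 0;     /* workaround 3, or slow path */
    }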
diff --git a/xsa398-4.14-6-x86-spec-ctrl-Cease-using-thunk-lfence-on-AMD.patch b/xsa398-4.14-6-x86-spec-ctrl-Cease-using-thunk-lfence-on-AMD.patch
deleted file mode 100644
index e9efec5..0000000
--- a/xsa398-4.14-6-x86-spec-ctrl-Cease-using-thunk-lfence-on-AMD.patch
+++ /dev/null
@@ -1,118 +0,0 @@
-From ca304edd3ba8c19211107fd2e898249987557ce5 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper
-Date: Mon, 7 Mar 2022 16:35:52 +0000
-Subject: x86/spec-ctrl: Cease using thunk=lfence on AMD
-
-AMD have updated their Spectre v2 guidance, and lfence/jmp is no longer
-considered safe. AMD are recommending using retpoline everywhere.
-
-Retpoline is incompatible with CET. All CET-capable hardware has efficient
-IBRS (specifically, not something retrofitted in microcode), so use IBRS (and
-STIBP for consistency sake).
-
-This is a logical change on AMD, but not on Intel as the default calculations
-would end up with these settings anyway. Leave behind a message if IBRS is
-found to be missing.
-
-Also update the default heuristics to never select THUNK_LFENCE. This causes
-AMD CPUs to change their default to retpoline.
-
-Also update the printed message to include the AMD MSR_SPEC_CTRL settings, and
-STIBP now that we set it for consistency sake.
-
-This is part of XSA-398 / CVE-2021-26401.
-
-Signed-off-by: Andrew Cooper
-Reviewed-by: Jan Beulich
-(cherry picked from commit 8d03080d2a339840d3a59e0932a94f804e45110d)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index fd8f82549152..c0bfbb7a5c27 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2140,9 +2140,9 @@ to use.
-
- If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to
- select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
--locations. The default thunk is `retpoline` (generally preferred for Intel
--hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal
--overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD).
-+locations. The default thunk is `retpoline` (generally preferred), with the
-+alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and
-+`lfence` (an `lfence; jmp *%reg` gadget).
-
- On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
- `ibrs=` option can be used to force or prevent Xen using the feature itself.
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 9301d95bd705..7ded6ecba197 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -367,14 +367,19 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
-            "\n");
-
-     /* Settings for Xen's protection, irrespective of guests. */
--    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s%s%s\n",
-+    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s, Other:%s%s%s%s%s\n",
-            thunk == THUNK_NONE      ? "N/A" :
-            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
-            thunk == THUNK_LFENCE    ? "LFENCE" :
-            thunk == THUNK_JMP       ? "JMP" : "?",
--           !boot_cpu_has(X86_FEATURE_IBRSB)          ? "No" :
-+           (!boot_cpu_has(X86_FEATURE_IBRSB) &&
-+            !boot_cpu_has(X86_FEATURE_IBRS))         ? "No" :
-            (default_xen_spec_ctrl & SPEC_CTRL_IBRS)  ? "IBRS+" :  "IBRS-",
--           !boot_cpu_has(X86_FEATURE_SSBD)           ? "" :
-+           (!boot_cpu_has(X86_FEATURE_STIBP) &&
-+            !boot_cpu_has(X86_FEATURE_AMD_STIBP))    ? "" :
-+           (default_xen_spec_ctrl & SPEC_CTRL_STIBP) ? " STIBP+" : " STIBP-",
-+           (!boot_cpu_has(X86_FEATURE_SSBD) &&
-+            !boot_cpu_has(X86_FEATURE_AMD_SSBD))     ? "" :
-            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
-            !(caps & ARCH_CAPS_TSX_CTRL)              ? "" :
-            (opt_tsx & 1)                             ? " TSX+" : " TSX-",
-@@ -916,10 +921,23 @@ void __init init_speculation_mitigations(void)
-     /*
-      * First, disable the use of retpolines if Xen is using shadow stacks, as
-      * they are incompatible.
-+     *
-+     * In the absence of retpolines, IBRS needs to be used for speculative
-+     * safety. All CET-capable hardware has efficient IBRS.
-      */
--    if ( cpu_has_xen_shstk &&
--         (opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE) )
--        thunk = THUNK_JMP;
-+    if ( cpu_has_xen_shstk )
-+    {
-+        if ( !has_spec_ctrl )
-+            printk(XENLOG_WARNING "?!? CET active, but no MSR_SPEC_CTRL?\n");
-+        else if ( opt_ibrs == -1 )
-+        {
-+            opt_ibrs = ibrs = true;
-+            default_xen_spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_STIBP;
-+        }
-+
-+        if ( opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE )
-+            thunk = THUNK_JMP;
-+    }
-
-     /*
-      * Has the user specified any custom BTI mitigations? If so, follow their
-@@ -939,16 +957,10 @@ void __init init_speculation_mitigations(void)
-         if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
-         {
-             /*
--             * AMD's recommended mitigation is to set lfence as being dispatch
--             * serialising, and to use IND_THUNK_LFENCE.
--             */
--            if ( cpu_has_lfence_dispatch )
--                thunk = THUNK_LFENCE;
--            /*
--             * On Intel hardware, we'd like to use retpoline in preference to
-+             * On all hardware, we'd like to use retpoline in preference to
-              * IBRS, but only if it is safe on this hardware.
-              */
--            else if ( retpoline_safe(caps) )
-+            if ( retpoline_safe(caps) )
-                 thunk = THUNK_RETPOLINE;
-             else if ( has_spec_ctrl )
-                 ibrs = true;
diff --git a/xsa399-4.16.patch b/xsa399-4.16.patch
deleted file mode 100644
index 5f3850e..0000000
--- a/xsa399-4.16.patch
+++ /dev/null
@@ -1,45 +0,0 @@
-From: Jan Beulich
-Subject: VT-d: correct ordering of operations in cleanup_domid_map()
-
-The function may be called without any locks held (leaving aside the
-domctl one, which we surely don't want to depend on here), so needs to
-play safe wrt other accesses to domid_map[] and domid_bitmap[]. This is
-to avoid context_set_domain_id()'s writing of domid_map[] to be reset to
-zero right away in the case of it racing the freeing of a DID.
-
-For the interaction with context_set_domain_id() and ->domid_map[] reads
-see the code comment.
-
-{check_,}cleanup_domid_map() are called with pcidevs_lock held or during
-domain cleanup only (and pcidevs_lock is also held around
-context_set_domain_id()), i.e. racing calls with the same (dom, iommu)
-tuple cannot occur.
-
-domain_iommu_domid(), besides its use by cleanup_domid_map(), has its
-result used only to control flushing, and hence a stale result would
-only lead to a stray extra flush.
-
-This is CVE-2022-26357 / XSA-399.
-
-Fixes: b9c20c78789f ("VT-d: per-iommu domain-id")
-Signed-off-by: Jan Beulich
-Reviewed-by: Roger Pau Monné
-
----- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -152,8 +152,14 @@ static void cleanup_domid_map(struct dom
-
-     if ( iommu_domid >= 0 )
-     {
-+        /*
-+         * Update domid_map[] /before/ domid_bitmap[] to avoid a race with
-+         * context_set_domain_id(), setting the slot to DOMID_INVALID for
-+         * ->domid_map[] reads to produce a suitable value while the bit is
-+         * still set.
-+         */
-+        iommu->domid_map[iommu_domid] = DOMID_INVALID;
-         clear_bit(iommu_domid, iommu->domid_bitmap);
--        iommu->domid_map[iommu_domid] = 0;
-     }
- }
-
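A condensed sketch of the publish-before-free ordering the comment above describes (standalone C, not the VT-d code; Xen's DOMID_INVALID value is used, everything else is simplified for illustration):

    #include <stdint.h>

    typedef uint16_t domid_t;
    #define DOMID_INVALID ((domid_t)0x7ff4)

    /* Freeing side, as reordered by the patch: publish DOMID_INVALID in
     * domid_map[] first, then release the DID in domid_bitmap[].  A
     * concurrent context_set_domain_id() that re-allocates the DID can
     * then never expose the stale previous owner to ->domid_map[] readers. */
    static void release_did(volatile domid_t *map, unsigned long *bitmap,
                            unsigned int did)
    {
        unsigned int bits = 8 * sizeof(unsigned long);

        map[did] = DOMID_INVALID;                  /* 1: poison the slot */
        __atomic_fetch_and(&bitmap[did / bits],    /* 2: free the DID */
                           ~(1UL << (did % bits)), __ATOMIC_RELEASE);
    }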
diff --git a/xsa400-4.14-01.patch b/xsa400-4.14-01.patch
deleted file mode 100644
index 61f32df..0000000
--- a/xsa400-4.14-01.patch
+++ /dev/null
@@ -1,105 +0,0 @@
-From: Jan Beulich
-Subject: VT-d: fix (de)assign ordering when RMRRs are in use
-
-In the event that the RMRR mappings are essential for device operation,
-they should be established before updating the device's context entry,
-while they should be torn down only after the device's context entry was
-successfully updated.
-
-Also adjust a related log message.
-
-This is CVE-2022-26358 / part of XSA-400.
-
-Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
-Signed-off-by: Jan Beulich
-Reviewed-by: Roger Pau Monné
-Reviewed-by: Paul Durrant
-Reviewed-by: Kevin Tian
-
----- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -2411,6 +2411,10 @@ static int reassign_device_ownership(
- {
-     int ret;
-
-+    ret = domain_context_unmap(source, devfn, pdev);
-+    if ( ret )
-+        return ret;
-+
-     /*
-      * Devices assigned to untrusted domains (here assumed to be any domU)
-      * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
-@@ -2447,10 +2451,6 @@ static int reassign_device_ownership(
-         }
-     }
-
--    ret = domain_context_unmap(source, devfn, pdev);
--    if ( ret )
--        return ret;
--
-     if ( devfn == pdev->devfn && pdev->domain != dom_io )
-     {
-         list_move(&pdev->domain_list, &dom_io->pdev_list);
-@@ -2527,9 +2527,8 @@ static int intel_iommu_assign_device(
-         }
-     }
-
--    ret = reassign_device_ownership(s, d, devfn, pdev);
--    if ( ret || d == dom_io )
--        return ret;
-+    if ( d == dom_io )
-+        return reassign_device_ownership(s, d, devfn, pdev);
-
-     /* Setup rmrr identity mapping */
-     for_each_rmrr_device( rmrr, bdf, i )
-     {
-@@ -2542,20 +2541,37 @@ static int intel_iommu_assign_device(
-                                          rmrr->end_address, flag);
-             if ( ret )
-             {
--                int rc;
--
--                rc = reassign_device_ownership(d, s, devfn, pdev);
-                 printk(XENLOG_G_ERR VTDPREFIX
--                       " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
--                       rmrr->base_address, rmrr->end_address,
--                       d->domain_id, ret);
--                if ( rc )
--                {
--                    printk(XENLOG_ERR VTDPREFIX
--                           " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
--                           seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
--                    domain_crash(d);
--                }
-+                       "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n",
-+                       d, rmrr->base_address, rmrr->end_address, ret);
-+                break;
-+            }
-+        }
-+    }
-+
-+    if ( !ret )
-+        ret = reassign_device_ownership(s, d, devfn, pdev);
-+
-+    /* See reassign_device_ownership() for the hwdom aspect. */
-+    if ( !ret || is_hardware_domain(d) )
-+        return ret;
-+
-+    for_each_rmrr_device( rmrr, bdf, i )
-+    {
-+        if ( rmrr->segment == seg &&
-+             PCI_BUS(bdf) == bus &&
-+             PCI_DEVFN2(bdf) == devfn )
-+        {
-+            int rc = iommu_identity_mapping(d, p2m_access_x,
-+                                            rmrr->base_address,
-+                                            rmrr->end_address, 0);
-+
-+            if ( rc && rc != -ENOENT )
-+            {
-+                printk(XENLOG_ERR VTDPREFIX
-+                       "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n",
-+                       d, rmrr->base_address, rmrr->end_address, rc);
-+                domain_crash(d);
-                 break;
-             }
-         }
diff --git a/xsa400-4.14-02.patch b/xsa400-4.14-02.patch
deleted file mode 100644
index cb47c2e..0000000
--- a/xsa400-4.14-02.patch
+++ /dev/null
@@ -1,80 +0,0 @@
-From: Jan Beulich
-Subject: VT-d: fix add/remove ordering when RMRRs are in use
-
-In the event that the RMRR mappings are essential for device operation,
-they should be established before updating the device's context entry,
-while they should be torn down only after the device's context entry was
-successfully cleared.
-
-Also switch to %pd in related log messages.
-
-Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR")
-Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
-Signed-off-by: Jan Beulich
-Reviewed-by: Roger Pau Monné
-Reviewed-by: Kevin Tian
-
----- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -2000,14 +2000,6 @@ static int intel_iommu_add_device(u8 dev
-     if ( !pdev->domain )
-         return -EINVAL;
-
--    ret = domain_context_mapping(pdev->domain, devfn, pdev);
--    if ( ret )
--    {
--        dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
--                pdev->domain->domain_id);
--        return ret;
--    }
--
-     for_each_rmrr_device ( rmrr, bdf, i )
-     {
-         if ( rmrr->segment == pdev->seg &&
-@@ -2024,12 +2016,17 @@ static int intel_iommu_add_device(u8 dev
-                                          rmrr->base_address, rmrr->end_address,
-                                          0);
-             if ( ret )
--                dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
--                        pdev->domain->domain_id);
-+                dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n",
-+                        pdev->domain);
-         }
-     }
-
--    return 0;
-+    ret = domain_context_mapping(pdev->domain, devfn, pdev);
-+    if ( ret )
-+        dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n",
-+                pdev->domain);
-+
-+    return ret;
- }
-
- static int intel_iommu_enable_device(struct pci_dev *pdev)
-@@ -2051,11 +2048,15 @@ static int intel_iommu_remove_device(u8
- {
-     struct acpi_rmrr_unit *rmrr;
-     u16 bdf;
--    int i;
-+    int ret, i;
-
-     if ( !pdev->domain )
-         return -EINVAL;
-
-+    ret = domain_context_unmap(pdev->domain, devfn, pdev);
-+    if ( ret )
-+        return ret;
-+
-     for_each_rmrr_device ( rmrr, bdf, i )
-     {
-         if ( rmrr->segment != pdev->seg ||
-@@ -2071,7 +2072,7 @@ static int intel_iommu_remove_device(u8
-                                 rmrr->end_address, 0);
-     }
-
--    return domain_context_unmap(pdev->domain, devfn, pdev);
-+    return 0;
- }
-
- static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
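Both XSA-400 ordering fixes above follow the same bracketing rule: mappings the device may depend on (the RMRRs) come up before the context entry is written, and go away only after it has been cleared. A generic sketch of that discipline (standalone C; every name here is invented for illustration, none is a VT-d function):

    struct dev { int id; };

    static int  map_rmrrs(struct dev *d)           { (void)d; return 0; }
    static void unmap_rmrrs(struct dev *d)         { (void)d; }
    static int  write_context_entry(struct dev *d) { (void)d; return 0; }
    static int  clear_context_entry(struct dev *d) { (void)d; return 0; }

    /* Setup: auxiliary mappings first, then publish the context entry. */
    static int attach(struct dev *d)
    {
        int rc = map_rmrrs(d);

        if ( !rc )
        {
            rc = write_context_entry(d);
            if ( rc )
                unmap_rmrrs(d);          /* undo on failure */
        }
        return rc;
    }

    /* Teardown: unpublish the context entry first, then drop the maps. */
    static int detach(struct dev *d)
    {
        int rc = clear_context_entry(d);

        if ( !rc )
            unmap_rmrrs(d);
        return rc;
    }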
diff --git a/xsa400-4.14-03.patch b/xsa400-4.14-03.patch
deleted file mode 100644
index c011308..0000000
--- a/xsa400-4.14-03.patch
+++ /dev/null
@@ -1,97 +0,0 @@
-From: Jan Beulich
-Subject: VT-d: drop ownership checking from domain_context_mapping_one()
-
-Despite putting in quite a bit of effort it was not possible to
-establish why exactly this code exists (beyond possibly sanity
-checking). Instead of a subsequent change further complicating this
-logic, simply get rid of it.
-
-Take the opportunity and move the respective unmap_vtd_domain_page() out
-of the locked region.
-
-Signed-off-by: Jan Beulich
-Reviewed-by: Roger Pau Monné
-Reviewed-by: Paul Durrant
-Reviewed-by: Kevin Tian
-
----- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -120,28 +120,6 @@ static int context_set_domain_id(struct
-     return 0;
- }
-
--static int context_get_domain_id(struct context_entry *context,
--                                 struct vtd_iommu *iommu)
--{
--    unsigned long dom_index, nr_dom;
--    int domid = -1;
--
--    if (iommu && context)
--    {
--        nr_dom = cap_ndoms(iommu->cap);
--
--        dom_index = context_domain_id(*context);
--
--        if ( dom_index < nr_dom && iommu->domid_map )
--            domid = iommu->domid_map[dom_index];
--        else
--            dprintk(XENLOG_DEBUG VTDPREFIX,
--                    "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
--                    dom_index, nr_dom);
--    }
--    return domid;
--}
--
- static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
- {
-     int iommu_domid = domain_iommu_domid(domain, iommu);
-@@ -1395,47 +1373,9 @@ int domain_context_mapping_one(
-
-     if ( context_present(*context) )
-     {
--        int res = 0;
--
--        /* Try to get domain ownership from device structure.  If that's
--         * not available, try to read it from the context itself. */
--        if ( pdev )
--        {
--            if ( pdev->domain != domain )
--            {
--                printk(XENLOG_G_INFO VTDPREFIX
--                       "%pd: %04x:%02x:%02x.%u owned by %pd\n",
--                       domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
--                       pdev->domain);
--                res = -EINVAL;
--            }
--        }
--        else
--        {
--            int cdomain;
--            cdomain = context_get_domain_id(context, iommu);
--
--            if ( cdomain < 0 )
--            {
--                printk(XENLOG_G_WARNING VTDPREFIX
--                       "%pd: %04x:%02x:%02x.%u mapped, but can't find owner\n",
--                       domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
--                res = -EINVAL;
--            }
--            else if ( cdomain != domain->domain_id )
--            {
--                printk(XENLOG_G_INFO VTDPREFIX
--                       "%pd: %04x:%02x:%02x.%u already mapped to d%d\n",
--                       domain,
--                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
--                       cdomain);
--                res = -EINVAL;
--            }
--        }
--
--        unmap_vtd_domain_page(context_entries);
-         spin_unlock(&iommu->lock);
--        return res;
-+        unmap_vtd_domain_page(context_entries);
-+        return 0;
-     }
-
-     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
diff --git a/xsa400-4.14-04.patch b/xsa400-4.14-04.patch
deleted file mode 100644
index 9e0c7ad..0000000
--- a/xsa400-4.14-04.patch
+++ /dev/null
@@ -1,564 +0,0 @@
-From: Jan Beulich
-Subject: VT-d: re-assign devices directly
-
-Devices with RMRRs, due to it being unspecified how/when the specified
-memory regions may get accessed, may not be left disconnected from their
-respective mappings (as long as it's not certain that the device has
-been fully quiesced). Hence rather than unmapping the old context and
-then mapping the new one, re-assignment needs to be done in a single
-step.
-
-This is CVE-2022-26359 / part of XSA-400.
-
-Reported-by: Roger Pau Monné
-
-Similarly quarantining scratch-page mode relies on page tables to be
-continuously wired up.
-
-To avoid complicating things more than necessary, treat all devices
-mostly equally, i.e. regardless of their association with any RMRRs. The
-main difference is when it comes to updating context entries, which need
-to be atomic when there are RMRRs. Yet atomicity can only be achieved
-with CMPXCHG16B, availability of which we can't take for given.
-
-The seemingly complicated choice of non-negative return values for
-domain_context_mapping_one() is to limit code churn: This way callers
-passing NULL for pdev don't need fiddling with.
-
-Signed-off-by: Jan Beulich
-Reviewed-by: Kevin Tian
-Reviewed-by: Roger Pau Monné
-
----- a/xen/drivers/passthrough/vtd/extern.h
-+++ b/xen/drivers/passthrough/vtd/extern.h
-@@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr);
- void *map_vtd_domain_page(u64 maddr);
- void unmap_vtd_domain_page(void *va);
- int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
--                               u8 bus, u8 devfn, const struct pci_dev *);
-+                               uint8_t bus, uint8_t devfn,
-+                               const struct pci_dev *pdev, unsigned int mode);
- int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
-                              u8 bus, u8 devfn);
- int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
-@@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void);
- void platform_quirks_init(void);
- void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
- void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
--int __must_check me_wifi_quirk(struct domain *domain,
--                               u8 bus, u8 devfn, int map);
-+int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
-+                               uint8_t devfn, unsigned int mode);
- void pci_vtd_quirk(const struct pci_dev *);
- void quirk_iommu_caps(struct vtd_iommu *iommu);
-
----- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -116,6 +116,7 @@ static int context_set_domain_id(struct
-     }
-
-     set_bit(i, iommu->domid_bitmap);
-+    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
-     return 0;
- }
-@@ -1353,15 +1354,27 @@ static void __hwdom_init intel_iommu_hwd
-     }
- }
-
-+/*
-+ * This function returns
-+ * - a negative errno value upon error,
-+ * - zero upon success when previously the entry was non-present, or this isn't
-+ *   the "main" request for a device (pdev == NULL), or for no-op quarantining
-+ *   assignments,
-+ * - positive (one) upon success when previously the entry was present and this
-+ *   is the "main" request for a device (pdev != NULL).
-+ */
- int domain_context_mapping_one(
-     struct domain *domain,
-     struct vtd_iommu *iommu,
--    u8 bus, u8 devfn, const struct pci_dev *pdev)
-+    uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
-+    unsigned int mode)
- {
-     struct domain_iommu *hd = dom_iommu(domain);
--    struct context_entry *context, *context_entries;
-+    struct context_entry *context, *context_entries, lctxt;
-+    __uint128_t old;
-     u64 maddr, pgd_maddr;
--    u16 seg = iommu->drhd->segment;
-+    uint16_t seg = iommu->drhd->segment, prev_did = 0;
-+    struct domain *prev_dom = NULL;
-     int agaw, rc, ret;
-     bool_t flush_dev_iotlb;
-
-@@ -1370,17 +1383,32 @@ int domain_context_mapping_one(
-     maddr = bus_to_context_maddr(iommu, bus);
-     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
-     context = &context_entries[devfn];
-+    old = (lctxt = *context).full;
-
--    if ( context_present(*context) )
-+    if ( context_present(lctxt) )
-     {
--        spin_unlock(&iommu->lock);
--        unmap_vtd_domain_page(context_entries);
--        return 0;
-+        domid_t domid;
-+
-+        prev_did = context_domain_id(lctxt);
-+        domid = iommu->domid_map[prev_did];
-+        if ( domid < DOMID_FIRST_RESERVED )
-+            prev_dom = rcu_lock_domain_by_id(domid);
-+        else if ( domid == DOMID_IO )
-+            prev_dom = rcu_lock_domain(dom_io);
-+        if ( !prev_dom )
-+        {
-+            spin_unlock(&iommu->lock);
-+            unmap_vtd_domain_page(context_entries);
-+            dprintk(XENLOG_DEBUG VTDPREFIX,
-+                    "no domain for did %u (nr_dom %u)\n",
-+                    prev_did, cap_ndoms(iommu->cap));
-+            return -ESRCH;
-+        }
-     }
-
-     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
-     {
--        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
-+        context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU);
-         agaw = level_to_agaw(iommu->nr_pt_levels);
-     }
-     else
-@@ -1397,6 +1425,8 @@ int domain_context_mapping_one(
-             spin_unlock(&hd->arch.mapping_lock);
-             spin_unlock(&iommu->lock);
-             unmap_vtd_domain_page(context_entries);
-+            if ( prev_dom )
-+                rcu_unlock_domain(prev_dom);
-             return -ENOMEM;
-         }
-     }
-@@ -1414,33 +1444,102 @@ int domain_context_mapping_one(
-                 goto nomem;
-         }
-
--        context_set_address_root(*context, pgd_maddr);
-+        context_set_address_root(lctxt, pgd_maddr);
-         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
--            context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
-+            context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
-         else
--            context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
-+            context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL);
-
-         spin_unlock(&hd->arch.mapping_lock);
-     }
-
--    if ( context_set_domain_id(context, domain, iommu) )
-+    rc = context_set_domain_id(&lctxt, domain, iommu);
-+    if ( rc )
-     {
-+    unlock:
-         spin_unlock(&iommu->lock);
-         unmap_vtd_domain_page(context_entries);
--        return -EFAULT;
-+        if ( prev_dom )
-+            rcu_unlock_domain(prev_dom);
-+        return rc;
-+    }
-+
-+    if ( !prev_dom )
-+    {
-+        context_set_address_width(lctxt, agaw);
-+        context_set_fault_enable(lctxt);
-+        context_set_present(lctxt);
-+    }
-+    else if ( prev_dom == domain )
-+    {
-+        ASSERT(lctxt.full == context->full);
-+        rc = !!pdev;
-+        goto unlock;
-+    }
-+    else
-+    {
-+        ASSERT(context_address_width(lctxt) == agaw);
-+        ASSERT(!context_fault_disable(lctxt));
-+    }
-+
-+    if ( cpu_has_cx16 )
-+    {
-+        __uint128_t res = cmpxchg16b(context, &old, &lctxt.full);
-+
-+        /*
-+         * Hardware does not update the context entry behind our backs,
-+         * so the return value should match "old".
-+         */
-+        if ( res != old )
-+        {
-+            if ( pdev )
-+                check_cleanup_domid_map(domain, pdev, iommu);
-+            printk(XENLOG_ERR
-+                   "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
-+                   pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-+                   (uint64_t)(res >> 64), (uint64_t)res,
-+                   (uint64_t)(old >> 64), (uint64_t)old);
-+            rc = -EILSEQ;
-+            goto unlock;
-+        }
-+    }
-+    else if ( !prev_dom || !(mode & MAP_WITH_RMRR) )
-+    {
-+        context_clear_present(*context);
-+        iommu_sync_cache(context, sizeof(*context));
-+
-+        write_atomic(&context->hi, lctxt.hi);
-+        /* No barrier should be needed between these two. */
-+        write_atomic(&context->lo, lctxt.lo);
-+    }
-+    else /* Best effort, updating DID last. */
-+    {
-+         /*
-+          * By non-atomically updating the context entry's DID field last,
-+          * during a short window in time TLB entries with the old domain ID
-+          * but the new page tables may be inserted.  This could affect I/O
-+          * of other devices using this same (old) domain ID.  Such updating
-+          * therefore is not a problem if this was the only device associated
-+          * with the old domain ID.  Diverting I/O of any of a dying domain's
-+          * devices to the quarantine page tables is intended anyway.
-+          */
-+        if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) )
-+            printk(XENLOG_WARNING VTDPREFIX
-+                   " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
-+                   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom);
-+
-+        write_atomic(&context->lo, lctxt.lo);
-+        /* No barrier should be needed between these two. */
-+        write_atomic(&context->hi, lctxt.hi);
-     }
-
--    context_set_address_width(*context, agaw);
--    context_set_fault_enable(*context);
--    context_set_present(*context);
-     iommu_sync_cache(context, sizeof(struct context_entry));
-     spin_unlock(&iommu->lock);
-
--    /* Context entry was previously non-present (with domid 0). */
--    rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
--                                    DMA_CCMD_MASK_NOBIT, 1);
-+    rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn),
-+                                    DMA_CCMD_MASK_NOBIT, !prev_dom);
-     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
--    ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
-+    ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb);
-
-     /*
-      * The current logic for returns:
-@@ -1461,17 +1560,26 @@ int domain_context_mapping_one(
-     unmap_vtd_domain_page(context_entries);
-
-     if ( !seg && !rc )
--        rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
-+        rc = me_wifi_quirk(domain, bus, devfn, mode);
-
-     if ( rc )
-     {
--        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
-+        if ( !prev_dom )
-+            ret = domain_context_unmap_one(domain, iommu, bus, devfn);
-+        else if ( prev_dom != domain ) /* Avoid infinite recursion. */
-+            ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
-+                                             mode & MAP_WITH_RMRR) < 0;
-+        else
-+            ret = 1;
-
-         if ( !ret && pdev && pdev->devfn == devfn )
-             check_cleanup_domid_map(domain, pdev, iommu);
-     }
-
--    return rc;
-+    if ( prev_dom )
-+        rcu_unlock_domain(prev_dom);
-+
-+    return rc ?: pdev && prev_dom;
- }
-
- static int domain_context_unmap(struct domain *d, uint8_t devfn,
-@@ -1481,8 +1589,10 @@ static int domain_context_mapping(struct
-                                   struct pci_dev *pdev)
- {
-     struct acpi_drhd_unit *drhd;
-+    const struct acpi_rmrr_unit *rmrr;
-     int ret = 0;
--    uint16_t seg = pdev->seg;
-+    unsigned int i, mode = 0;
-+    uint16_t seg = pdev->seg, bdf;
-     uint8_t bus = pdev->bus, secbus;
-
-     drhd = acpi_find_matched_drhd_unit(pdev);
-@@ -1502,8 +1612,29 @@ static int domain_context_mapping(struct
-
-     ASSERT(pcidevs_locked());
-
-+    for_each_rmrr_device( rmrr, bdf, i )
-+    {
-+        if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf )
-+            continue;
-+
-+        mode |= MAP_WITH_RMRR;
-+        break;
-+    }
-+
-+    if ( domain != pdev->domain )
-+    {
-+        if ( pdev->domain->is_dying )
-+            mode |= MAP_OWNER_DYING;
-+        else if ( drhd &&
-+                  !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) &&
-+                  !pdev->phantom_stride )
-+            mode |= MAP_SINGLE_DEVICE;
-+    }
-+
-     switch ( pdev->type )
-     {
-+        bool prev_present;
-+
-     case DEV_TYPE_PCI_HOST_BRIDGE:
-         if ( iommu_debug )
-             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
-@@ -1524,7 +1655,9 @@ static int domain_context_mapping(struct
-                    domain->domain_id, seg, bus,
-                    PCI_SLOT(devfn), PCI_FUNC(devfn));
-         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
--                                         pdev);
-+                                         pdev, mode);
-+        if ( ret > 0 )
-+            ret = 0;
-         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
-             enable_ats_device(pdev, &drhd->iommu->ats_devices);
-
-@@ -1537,9 +1670,10 @@ static int domain_context_mapping(struct
-                    PCI_SLOT(devfn), PCI_FUNC(devfn));
-
-         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
--                                         pdev);
--        if ( ret )
-+                                         pdev, mode);
-+        if ( ret < 0 )
-             break;
-+        prev_present = ret;
-
-         if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 )
-         {
-@@ -1547,6 +1681,15 @@ static int domain_context_mapping(struct
-                 break;
-             ret = -ENXIO;
-         }
-+        /*
-+         * Strictly speaking if the device is the only one behind this bridge
-+         * and the only one with this (secbus,0,0) tuple, it could be allowed
-+         * to be re-assigned regardless of RMRR presence.  But let's deal with
-+         * that case only if it is actually found in the wild.
-+         */
-+        else if ( prev_present && (mode & MAP_WITH_RMRR) &&
-+                  domain != pdev->domain )
-+            ret = -EOPNOTSUPP;
-
-         /*
-          * Mapping a bridge should, if anything, pass the struct pci_dev of
-@@ -1555,7 +1698,7 @@ static int domain_context_mapping(struct
-          */
-         if ( ret >= 0 )
-             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
--                                             NULL);
-+                                             NULL, mode);
-
-         /*
-          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
-@@ -1570,10 +1713,15 @@ static int domain_context_mapping(struct
-         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
-              (secbus != pdev->bus || pdev->devfn != 0) )
-             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
--                                             NULL);
-+                                             NULL, mode);
-
-         if ( ret )
--            domain_context_unmap(domain, devfn, pdev);
-+        {
-+            if ( !prev_present )
-+                domain_context_unmap(domain, devfn, pdev);
-+            else if ( pdev->domain != domain ) /* Avoid infinite recursion.
*/ -+ domain_context_mapping(pdev->domain, devfn, pdev); -+ } - - break; - -@@ -2352,9 +2500,8 @@ static int reassign_device_ownership( - { - int ret; - -- ret = domain_context_unmap(source, devfn, pdev); -- if ( ret ) -- return ret; -+ if ( !has_arch_pdevs(target) ) -+ vmx_pi_hooks_assign(target); - - /* - * Devices assigned to untrusted domains (here assumed to be any domU) -@@ -2364,6 +2511,31 @@ static int reassign_device_ownership( - if ( (target != hardware_domain) && !iommu_intremap ) - untrusted_msi = true; - -+ ret = domain_context_mapping(target, devfn, pdev); -+ if ( ret ) -+ { -+ if ( !has_arch_pdevs(target) ) -+ vmx_pi_hooks_deassign(target); -+ return ret; -+ } -+ -+ if ( pdev->devfn == devfn ) -+ { -+ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); -+ -+ if ( drhd ) -+ check_cleanup_domid_map(source, pdev, drhd->iommu); -+ } -+ -+ if ( devfn == pdev->devfn && pdev->domain != target ) -+ { -+ list_move(&pdev->domain_list, &target->pdev_list); -+ pdev->domain = target; -+ } -+ -+ if ( !has_arch_pdevs(source) ) -+ vmx_pi_hooks_deassign(source); -+ - /* - * If the device belongs to the hardware domain, and it has RMRR, don't - * remove it from the hardware domain, because BIOS may use RMRR at -@@ -2392,34 +2564,7 @@ static int reassign_device_ownership( - } - } - -- if ( devfn == pdev->devfn && pdev->domain != dom_io ) -- { -- list_move(&pdev->domain_list, &dom_io->pdev_list); -- pdev->domain = dom_io; -- } -- -- if ( !has_arch_pdevs(source) ) -- vmx_pi_hooks_deassign(source); -- -- if ( !has_arch_pdevs(target) ) -- vmx_pi_hooks_assign(target); -- -- ret = domain_context_mapping(target, devfn, pdev); -- if ( ret ) -- { -- if ( !has_arch_pdevs(target) ) -- vmx_pi_hooks_deassign(target); -- -- return ret; -- } -- -- if ( devfn == pdev->devfn && pdev->domain != target ) -- { -- list_move(&pdev->domain_list, &target->pdev_list); -- pdev->domain = target; -- } -- -- return ret; -+ return 0; - } - - static int intel_iommu_assign_device( ---- a/xen/drivers/passthrough/vtd/iommu.h -+++ b/xen/drivers/passthrough/vtd/iommu.h -@@ -202,8 +202,12 @@ struct root_entry { - do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) - - struct context_entry { -- u64 lo; -- u64 hi; -+ union { -+ struct { -+ uint64_t lo, hi; -+ }; -+ __uint128_t full; -+ }; - }; - #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) - #define context_present(c) ((c).lo & 1) ---- a/xen/drivers/passthrough/vtd/quirks.c -+++ b/xen/drivers/passthrough/vtd/quirks.c -@@ -344,7 +344,8 @@ void __init platform_quirks_init(void) - */ - - static int __must_check map_me_phantom_function(struct domain *domain, -- u32 dev, int map) -+ unsigned int dev, -+ unsigned int mode) - { - struct acpi_drhd_unit *drhd; - struct pci_dev *pdev; -@@ -355,9 +356,9 @@ static int __must_check map_me_phantom_f - drhd = acpi_find_matched_drhd_unit(pdev); - - /* map or unmap ME phantom function */ -- if ( map ) -+ if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) - rc = domain_context_mapping_one(domain, drhd->iommu, 0, -- PCI_DEVFN(dev, 7), NULL); -+ PCI_DEVFN(dev, 7), NULL, mode); - else - rc = domain_context_unmap_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7)); -@@ -365,7 +366,8 @@ static int __must_check map_me_phantom_f - return rc; - } - --int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) -+int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, -+ unsigned int mode) - { - u32 id; - int rc = 0; -@@ -389,7 +391,7 @@ int me_wifi_quirk(struct domain *domain, - case 0x423b8086: - case 0x423c8086: 
- case 0x423d8086: -- rc = map_me_phantom_function(domain, 3, map); -+ rc = map_me_phantom_function(domain, 3, mode); - break; - default: - break; -@@ -415,7 +417,7 @@ int me_wifi_quirk(struct domain *domain, - case 0x42388086: /* Puma Peak */ - case 0x422b8086: - case 0x422c8086: -- rc = map_me_phantom_function(domain, 22, map); -+ rc = map_me_phantom_function(domain, 22, mode); - break; - default: - break; ---- a/xen/drivers/passthrough/vtd/vtd.h -+++ b/xen/drivers/passthrough/vtd/vtd.h -@@ -22,8 +22,14 @@ - - #include - --#define MAP_ME_PHANTOM_FUNC 1 --#define UNMAP_ME_PHANTOM_FUNC 0 -+/* -+ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode" -+ * parameters. -+ */ -+#define MAP_WITH_RMRR (1u << 0) -+#define MAP_OWNER_DYING (1u << 1) -+#define MAP_SINGLE_DEVICE (1u << 2) -+#define UNMAP_ME_PHANTOM_FUNC (1u << 3) - - /* Allow for both IOAPIC and IOSAPIC. */ - #define IO_xAPIC_route_entry IO_APIC_route_entry diff --git a/xsa400-4.14-05.patch b/xsa400-4.14-05.patch deleted file mode 100644 index 7283d08..0000000 --- a/xsa400-4.14-05.patch +++ /dev/null @@ -1,398 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: re-assign devices directly - -Devices with unity map ranges, due to it being unspecified how/when -these memory ranges may get accessed, may not be left disconnected from -their unity mappings (as long as it's not certain that the device has -been fully quiesced). Hence rather than tearing down the old root page -table pointer and then establishing the new one, re-assignment needs to -be done in a single step. - -This is CVE-2022-26360 / part of XSA-400. - -Reported-by: Roger Pau Monné - -Similarly quarantining scratch-page mode relies on page tables to be -continuously wired up. - -To avoid complicating things more than necessary, treat all devices -mostly equally, i.e. regardless of their association with any unity map -ranges. The main difference is when it comes to updating DTEs, which need -to be atomic when there are unity mappings. Yet atomicity can only be -achieved with CMPXCHG16B, availability of which we can't take for given. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Roger Pau Monné - ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -247,9 +247,13 @@ void amd_iommu_set_intremap_table(struct - const void *ptr, - const struct amd_iommu *iommu, - bool valid); --void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, -- uint64_t root_ptr, uint16_t domain_id, -- uint8_t paging_mode, bool valid); -+#define SET_ROOT_VALID (1u << 0) -+#define SET_ROOT_WITH_UNITY_MAP (1u << 1) -+int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, -+ uint64_t root_ptr, -+ uint16_t domain_id, -+ uint8_t paging_mode, -+ unsigned int flags); - void iommu_dte_add_device_entry(struct amd_iommu_dte *dte, - const struct ivrs_mappings *ivrs_dev); - void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id, ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -99,10 +99,69 @@ static unsigned int set_iommu_pte_presen - return flush_flags; - } - --void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, -- uint64_t root_ptr, uint16_t domain_id, -- uint8_t paging_mode, bool valid) -+/* -+ * This function returns -+ * - -errno for errors, -+ * - 0 for a successful update, atomic when necessary -+ * - 1 for a successful but non-atomic update, which may need to be warned -+ * about by the caller. 
-+ */ -+int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, -+ uint64_t root_ptr, uint16_t domain_id, -+ uint8_t paging_mode, unsigned int flags) - { -+ bool valid = flags & SET_ROOT_VALID; -+ -+ if ( dte->v && dte->tv && -+ (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) ) -+ { -+ union { -+ struct amd_iommu_dte dte; -+ uint64_t raw64[4]; -+ __uint128_t raw128[2]; -+ } ldte = { .dte = *dte }; -+ __uint128_t old = ldte.raw128[0]; -+ int ret = 0; -+ -+ ldte.dte.domain_id = domain_id; -+ ldte.dte.pt_root = paddr_to_pfn(root_ptr); -+ ldte.dte.iw = true; -+ ldte.dte.ir = true; -+ ldte.dte.paging_mode = paging_mode; -+ ldte.dte.v = valid; -+ -+ if ( cpu_has_cx16 ) -+ { -+ __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]); -+ -+ /* -+ * Hardware does not update the DTE behind our backs, so the -+ * return value should match "old". -+ */ -+ if ( res != old ) -+ { -+ printk(XENLOG_ERR -+ "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n", -+ domain_id, -+ (uint64_t)(res >> 64), (uint64_t)res, -+ (uint64_t)(old >> 64), (uint64_t)old); -+ ret = -EILSEQ; -+ } -+ } -+ else /* Best effort, updating domain_id last. */ -+ { -+ uint64_t *ptr = (void *)dte; -+ -+ write_atomic(ptr + 0, ldte.raw64[0]); -+ /* No barrier should be needed between these two. */ -+ write_atomic(ptr + 1, ldte.raw64[1]); -+ -+ ret = 1; -+ } -+ -+ return ret; -+ } -+ - if ( valid || dte->v ) - { - dte->tv = false; -@@ -117,6 +176,8 @@ void amd_iommu_set_root_page_table(struc - smp_wmb(); - dte->tv = true; - dte->v = valid; -+ -+ return 0; - } - - void amd_iommu_set_intremap_table( ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -82,40 +82,81 @@ int get_dma_requestor_id(uint16_t seg, u - return req_id; - } - --static void amd_iommu_setup_domain_device( -+static int __must_check allocate_domain_resources(struct domain_iommu *hd) -+{ -+ int rc; -+ -+ spin_lock(&hd->arch.mapping_lock); -+ rc = amd_iommu_alloc_root(hd); -+ spin_unlock(&hd->arch.mapping_lock); -+ -+ return rc; -+} -+ -+static bool any_pdev_behind_iommu(const struct domain *d, -+ const struct pci_dev *exclude, -+ const struct amd_iommu *iommu) -+{ -+ const struct pci_dev *pdev; -+ -+ for_each_pdev ( d, pdev ) -+ { -+ if ( pdev == exclude ) -+ continue; -+ -+ if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu ) -+ return true; -+ } -+ -+ return false; -+} -+ -+static int __must_check amd_iommu_setup_domain_device( - struct domain *domain, struct amd_iommu *iommu, - uint8_t devfn, struct pci_dev *pdev) - { - struct amd_iommu_dte *table, *dte; - unsigned long flags; -- int req_id, valid = 1; -+ unsigned int req_id, sr_flags; -+ int rc; - u8 bus = pdev->bus; -- const struct domain_iommu *hd = dom_iommu(domain); -+ struct domain_iommu *hd = dom_iommu(domain); -+ const struct ivrs_mappings *ivrs_dev; - -- BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || -- !iommu->dev_table.buffer ); -+ BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer); - -- if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) -- valid = 0; -+ rc = allocate_domain_resources(hd); -+ if ( rc ) -+ return rc; -+ -+ req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf); -+ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; -+ sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain) -+ ? 0 : SET_ROOT_VALID) -+ | (ivrs_dev->unity_map ? 
SET_ROOT_WITH_UNITY_MAP : 0); - - /* get device-table entry */ - req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); - table = iommu->dev_table.buffer; - dte = &table[req_id]; -+ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; - - spin_lock_irqsave(&iommu->lock, flags); - - if ( !dte->v || !dte->tv ) - { -- const struct ivrs_mappings *ivrs_dev; -- - /* bind DTE to domain page-tables */ -- amd_iommu_set_root_page_table( -- dte, page_to_maddr(hd->arch.root_table), domain->domain_id, -- hd->arch.paging_mode, valid); -+ rc = amd_iommu_set_root_page_table( -+ dte, page_to_maddr(hd->arch.root_table), -+ domain->domain_id, hd->arch.paging_mode, sr_flags); -+ if ( rc ) -+ { -+ ASSERT(rc < 0); -+ spin_unlock_irqrestore(&iommu->lock, flags); -+ return rc; -+ } - - /* Undo what amd_iommu_disable_domain_device() may have done. */ -- ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; - if ( dte->it_root ) - { - dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; -@@ -130,17 +171,74 @@ static void amd_iommu_setup_domain_devic - dte->i = ats_enabled; - - amd_iommu_flush_device(iommu, req_id); -+ } -+ else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) ) -+ { -+ /* -+ * Strictly speaking if the device is the only one with this requestor -+ * ID, it could be allowed to be re-assigned regardless of unity map -+ * presence. But let's deal with that case only if it is actually -+ * found in the wild. -+ */ -+ if ( req_id != PCI_BDF2(bus, devfn) && -+ (sr_flags & SET_ROOT_WITH_UNITY_MAP) ) -+ rc = -EOPNOTSUPP; -+ else -+ rc = amd_iommu_set_root_page_table( -+ dte, page_to_maddr(hd->arch.root_table), -+ domain->domain_id, hd->arch.paging_mode, sr_flags); -+ if ( rc < 0 ) -+ { -+ spin_unlock_irqrestore(&iommu->lock, flags); -+ return rc; -+ } -+ if ( rc && -+ domain != pdev->domain && -+ /* -+ * By non-atomically updating the DTE's domain ID field last, -+ * during a short window in time TLB entries with the old domain -+ * ID but the new page tables may have been inserted. This could -+ * affect I/O of other devices using this same (old) domain ID. -+ * Such updating therefore is not a problem if this was the only -+ * device associated with the old domain ID. Diverting I/O of any -+ * of a dying domain's devices to the quarantine page tables is -+ * intended anyway. -+ */ -+ !pdev->domain->is_dying && -+ (any_pdev_behind_iommu(pdev->domain, pdev, iommu) || -+ pdev->phantom_stride) ) -+ printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", -+ pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), -+ pdev->domain); -+ -+ /* -+ * Check remaining settings are still in place from an earlier call -+ * here. They're all independent of the domain, so should not have -+ * changed. 
-+ */ -+ if ( dte->it_root ) -+ ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED); -+ ASSERT(dte->iv == iommu_intremap); -+ ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion); -+ ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags, -+ ACPI_IVHD_SYSTEM_MGMT)); - -- AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " -- "root table = %#"PRIx64", " -- "domain = %d, paging mode = %d\n", -- req_id, pdev->type, -- page_to_maddr(hd->arch.root_table), -- domain->domain_id, hd->arch.paging_mode); -+ if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && -+ iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) -+ ASSERT(dte->i == ats_enabled); -+ -+ amd_iommu_flush_device(iommu, req_id); - } - - spin_unlock_irqrestore(&iommu->lock, flags); - -+ AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " -+ "root table = %#"PRIx64", " -+ "domain = %d, paging mode = %d\n", -+ req_id, pdev->type, -+ page_to_maddr(hd->arch.root_table), -+ domain->domain_id, hd->arch.paging_mode); -+ - ASSERT(pcidevs_locked()); - - if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && -@@ -151,6 +249,8 @@ static void amd_iommu_setup_domain_devic - - amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0); - } -+ -+ return 0; - } - - int __init acpi_ivrs_init(void) -@@ -214,17 +314,6 @@ int amd_iommu_alloc_root(struct domain_i - return 0; - } - --static int __must_check allocate_domain_resources(struct domain_iommu *hd) --{ -- int rc; -- -- spin_lock(&hd->arch.mapping_lock); -- rc = amd_iommu_alloc_root(hd); -- spin_unlock(&hd->arch.mapping_lock); -- -- return rc; --} -- - int __read_mostly amd_iommu_min_paging_mode = 1; - - static int amd_iommu_domain_init(struct domain *d) -@@ -324,7 +413,6 @@ static int reassign_device(struct domain - { - struct amd_iommu *iommu; - int bdf, rc; -- struct domain_iommu *t = dom_iommu(target); - const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); - - bdf = PCI_BDF2(pdev->bus, pdev->devfn); -@@ -338,7 +426,15 @@ static int reassign_device(struct domain - return -ENODEV; - } - -- amd_iommu_disable_domain_device(source, iommu, devfn, pdev); -+ rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev); -+ if ( rc ) -+ return rc; -+ -+ if ( devfn == pdev->devfn && pdev->domain != target ) -+ { -+ list_move(&pdev->domain_list, &target->pdev_list); -+ pdev->domain = target; -+ } - - /* - * If the device belongs to the hardware domain, and it has a unity mapping, -@@ -354,27 +450,10 @@ static int reassign_device(struct domain - return rc; - } - -- if ( devfn == pdev->devfn && pdev->domain != dom_io ) -- { -- list_move(&pdev->domain_list, &dom_io->pdev_list); -- pdev->domain = dom_io; -- } -- -- rc = allocate_domain_resources(t); -- if ( rc ) -- return rc; -- -- amd_iommu_setup_domain_device(target, iommu, devfn, pdev); - AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n", - pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), - source->domain_id, target->domain_id); - -- if ( devfn == pdev->devfn && pdev->domain != target ) -- { -- list_move(&pdev->domain_list, &target->pdev_list); -- pdev->domain = target; -- } -- - return 0; - } - -@@ -538,8 +617,7 @@ static int amd_iommu_add_device(u8 devfn - spin_unlock_irqrestore(&iommu->lock, flags); - } - -- amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); -- return 0; -+ return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); - } - - static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) diff --git a/xsa400-4.14-06.patch 
b/xsa400-4.14-06.patch deleted file mode 100644 index 07f69d7..0000000 --- a/xsa400-4.14-06.patch +++ /dev/null @@ -1,279 +0,0 @@ -From: Jan Beulich -Subject: VT-d: prepare for per-device quarantine page tables (part I) - -Arrange for domain ID and page table root to be passed around, the latter in -particular to domain_pgd_maddr() such that taking it from the per-domain -fields can be overridden. - -No functional change intended. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Roger Pau Monné -Reviewed-by: Kevin Tian - ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -86,9 +86,10 @@ void *map_vtd_domain_page(u64 maddr); - void unmap_vtd_domain_page(void *va); - int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, - uint8_t bus, uint8_t devfn, -- const struct pci_dev *pdev, unsigned int mode); -+ const struct pci_dev *pdev, domid_t domid, -+ paddr_t pgd_maddr, unsigned int mode); - int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, -- u8 bus, u8 devfn); -+ uint8_t bus, uint8_t devfn, domid_t domid); - int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); - - unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); -@@ -107,7 +108,8 @@ void platform_quirks_init(void); - void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); - void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); - int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, -- uint8_t devfn, unsigned int mode); -+ uint8_t devfn, domid_t domid, paddr_t pgd_maddr, -+ unsigned int mode); - void pci_vtd_quirk(const struct pci_dev *); - void quirk_iommu_caps(struct vtd_iommu *iommu); - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -1367,12 +1367,12 @@ int domain_context_mapping_one( - struct domain *domain, - struct vtd_iommu *iommu, - uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, -- unsigned int mode) -+ domid_t domid, paddr_t pgd_maddr, unsigned int mode) - { - struct domain_iommu *hd = dom_iommu(domain); - struct context_entry *context, *context_entries, lctxt; - __uint128_t old; -- u64 maddr, pgd_maddr; -+ uint64_t maddr; - uint16_t seg = iommu->drhd->segment, prev_did = 0; - struct domain *prev_dom = NULL; - int agaw, rc, ret; -@@ -1413,10 +1413,12 @@ int domain_context_mapping_one( - } - else - { -+ paddr_t root = pgd_maddr; -+ - spin_lock(&hd->arch.mapping_lock); - - /* Ensure we have pagetables allocated down to leaf PTE. */ -- if ( hd->arch.pgd_maddr == 0 ) -+ if ( !root ) - { - addr_to_dma_page_maddr(domain, 0, 1); - if ( hd->arch.pgd_maddr == 0 ) -@@ -1429,22 +1431,24 @@ int domain_context_mapping_one( - rcu_unlock_domain(prev_dom); - return -ENOMEM; - } -+ -+ root = hd->arch.pgd_maddr; - } - - /* Skip top levels of page tables for 2- and 3-level DRHDs. 
*/ -- pgd_maddr = hd->arch.pgd_maddr; - for ( agaw = level_to_agaw(4); - agaw != level_to_agaw(iommu->nr_pt_levels); - agaw-- ) - { -- struct dma_pte *p = map_vtd_domain_page(pgd_maddr); -- pgd_maddr = dma_pte_addr(*p); -+ struct dma_pte *p = map_vtd_domain_page(root); -+ -+ root = dma_pte_addr(*p); - unmap_vtd_domain_page(p); -- if ( pgd_maddr == 0 ) -+ if ( !root ) - goto nomem; - } - -- context_set_address_root(lctxt, pgd_maddr); -+ context_set_address_root(lctxt, root); - if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) - context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); - else -@@ -1560,15 +1564,21 @@ int domain_context_mapping_one( - unmap_vtd_domain_page(context_entries); - - if ( !seg && !rc ) -- rc = me_wifi_quirk(domain, bus, devfn, mode); -+ rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); - - if ( rc ) - { - if ( !prev_dom ) -- ret = domain_context_unmap_one(domain, iommu, bus, devfn); -+ ret = domain_context_unmap_one(domain, iommu, bus, devfn, -+ domain->domain_id); - else if ( prev_dom != domain ) /* Avoid infinite recursion. */ -+ { -+ hd = dom_iommu(prev_dom); - ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, -+ domain->domain_id, -+ hd->arch.pgd_maddr, - mode & MAP_WITH_RMRR) < 0; -+ } - else - ret = 1; - -@@ -1590,6 +1600,7 @@ static int domain_context_mapping(struct - { - struct acpi_drhd_unit *drhd; - const struct acpi_rmrr_unit *rmrr; -+ paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; - int ret = 0; - unsigned int i, mode = 0; - uint16_t seg = pdev->seg, bdf; -@@ -1655,7 +1666,8 @@ static int domain_context_mapping(struct - domain->domain_id, seg, bus, - PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, -- pdev, mode); -+ pdev, domain->domain_id, pgd_maddr, -+ mode); - if ( ret > 0 ) - ret = 0; - if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) -@@ -1670,7 +1682,8 @@ static int domain_context_mapping(struct - PCI_SLOT(devfn), PCI_FUNC(devfn)); - - ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, -- pdev, mode); -+ pdev, domain->domain_id, pgd_maddr, -+ mode); - if ( ret < 0 ) - break; - prev_present = ret; -@@ -1698,7 +1711,8 @@ static int domain_context_mapping(struct - */ - if ( ret >= 0 ) - ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, -- NULL, mode); -+ NULL, domain->domain_id, pgd_maddr, -+ mode); - - /* - * Devices behind PCIe-to-PCI/PCIx bridge may generate different -@@ -1713,7 +1727,8 @@ static int domain_context_mapping(struct - if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && - (secbus != pdev->bus || pdev->devfn != 0) ) - ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, -- NULL, mode); -+ NULL, domain->domain_id, pgd_maddr, -+ mode); - - if ( ret ) - { -@@ -1742,7 +1757,7 @@ static int domain_context_mapping(struct - int domain_context_unmap_one( - struct domain *domain, - struct vtd_iommu *iommu, -- u8 bus, u8 devfn) -+ uint8_t bus, uint8_t devfn, domid_t domid) - { - struct context_entry *context, *context_entries; - u64 maddr; -@@ -1800,7 +1815,7 @@ int domain_context_unmap_one( - unmap_vtd_domain_page(context_entries); - - if ( !iommu->drhd->segment && !rc ) -- rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC); -+ rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC); - - if ( rc && !is_hardware_domain(domain) && domain != dom_io ) - { -@@ -1853,7 +1868,8 @@ static int domain_context_unmap(struct d - printk(VTDPREFIX "d%d:PCIe: 
unmap %04x:%02x:%02x.%u\n", - domain->domain_id, seg, bus, - PCI_SLOT(devfn), PCI_FUNC(devfn)); -- ret = domain_context_unmap_one(domain, iommu, bus, devfn); -+ ret = domain_context_unmap_one(domain, iommu, bus, devfn, -+ domain->domain_id); - if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) - disable_ats_device(pdev); - -@@ -1863,7 +1879,8 @@ static int domain_context_unmap(struct d - if ( iommu_debug ) - printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", - domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); -- ret = domain_context_unmap_one(domain, iommu, bus, devfn); -+ ret = domain_context_unmap_one(domain, iommu, bus, devfn, -+ domain->domain_id); - if ( ret ) - break; - -@@ -1889,12 +1906,15 @@ static int domain_context_unmap(struct d - /* PCIe to PCI/PCIx bridge */ - if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) - { -- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); -+ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, -+ domain->domain_id); - if ( !ret ) -- ret = domain_context_unmap_one(domain, iommu, secbus, 0); -+ ret = domain_context_unmap_one(domain, iommu, secbus, 0, -+ domain->domain_id); - } - else /* Legacy PCI bridge */ -- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); -+ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, -+ domain->domain_id); - - break; - ---- a/xen/drivers/passthrough/vtd/quirks.c -+++ b/xen/drivers/passthrough/vtd/quirks.c -@@ -345,6 +345,8 @@ void __init platform_quirks_init(void) - - static int __must_check map_me_phantom_function(struct domain *domain, - unsigned int dev, -+ domid_t domid, -+ paddr_t pgd_maddr, - unsigned int mode) - { - struct acpi_drhd_unit *drhd; -@@ -358,16 +360,17 @@ static int __must_check map_me_phantom_f - /* map or unmap ME phantom function */ - if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) - rc = domain_context_mapping_one(domain, drhd->iommu, 0, -- PCI_DEVFN(dev, 7), NULL, mode); -+ PCI_DEVFN(dev, 7), NULL, -+ domid, pgd_maddr, mode); - else - rc = domain_context_unmap_one(domain, drhd->iommu, 0, -- PCI_DEVFN(dev, 7)); -+ PCI_DEVFN(dev, 7), domid); - - return rc; - } - - int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, -- unsigned int mode) -+ domid_t domid, paddr_t pgd_maddr, unsigned int mode) - { - u32 id; - int rc = 0; -@@ -391,7 +394,7 @@ int me_wifi_quirk(struct domain *domain, - case 0x423b8086: - case 0x423c8086: - case 0x423d8086: -- rc = map_me_phantom_function(domain, 3, mode); -+ rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode); - break; - default: - break; -@@ -417,7 +420,7 @@ int me_wifi_quirk(struct domain *domain, - case 0x42388086: /* Puma Peak */ - case 0x422b8086: - case 0x422c8086: -- rc = map_me_phantom_function(domain, 22, mode); -+ rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode); - break; - default: - break; diff --git a/xsa400-4.14-07.patch b/xsa400-4.14-07.patch deleted file mode 100644 index f7593a3..0000000 --- a/xsa400-4.14-07.patch +++ /dev/null @@ -1,135 +0,0 @@ -From: Jan Beulich -Subject: VT-d: prepare for per-device quarantine page tables (part II) - -Replace the passing of struct domain * by domid_t in preparation of -per-device quarantine page tables also requiring per-device pseudo -domain IDs, which aren't going to be associated with any struct domain -instances. - -No functional change intended (except for slightly adjusted log message -text). 
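Illustration only, not part of the patch: the reason a bare domid_t has to be passed around is that the follow-up patch (xsa400-4.14-08) allocates per-device pseudo IDs as idx | (DOMID_MASK + 1), i.e. strictly above DOMID_MASK, a range in which no struct domain can ever exist. A minimal standalone sketch of that split, with is_pseudo_domid() being a hypothetical helper and the type/constant restated locally (values matching current Xen) so the sketch is self-contained:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint16_t domid_t;     /* restated from Xen for the sketch */
    #define DOMID_MASK 0x7fffU    /* "real" IDs use only half the range */

    /*
     * Hypothetical helper: iommu_alloc_domid() returns
     * idx | (DOMID_MASK + 1), and iommu_free_domid() asserts
     * domid > DOMID_MASK, so this predicate identifies pseudo IDs.
     */
    static inline bool is_pseudo_domid(domid_t id)
    {
        return id > DOMID_MASK;
    }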
- -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Kevin Tian -Reviewed-by: Roger Pau Monné - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -58,8 +58,8 @@ static struct tasklet vtd_fault_tasklet; - static int setup_hwdom_device(u8 devfn, struct pci_dev *); - static void setup_hwdom_rmrr(struct domain *d); - --static int domain_iommu_domid(struct domain *d, -- struct vtd_iommu *iommu) -+static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu, -+ bool warn) - { - unsigned long nr_dom, i; - -@@ -67,16 +67,16 @@ static int domain_iommu_domid(struct dom - i = find_first_bit(iommu->domid_bitmap, nr_dom); - while ( i < nr_dom ) - { -- if ( iommu->domid_map[i] == d->domain_id ) -+ if ( iommu->domid_map[i] == domid ) - return i; - - i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); - } - -- if ( !d->is_dying ) -+ if ( warn ) - dprintk(XENLOG_ERR VTDPREFIX, -- "Cannot get valid iommu %u domid: %pd\n", -- iommu->index, d); -+ "No valid iommu %u domid for Dom%d\n", -+ iommu->index, domid); - - return -1; - } -@@ -84,8 +84,7 @@ static int domain_iommu_domid(struct dom - #define DID_FIELD_WIDTH 16 - #define DID_HIGH_OFFSET 8 - static int context_set_domain_id(struct context_entry *context, -- struct domain *d, -- struct vtd_iommu *iommu) -+ domid_t domid, struct vtd_iommu *iommu) - { - unsigned long nr_dom, i; - int found = 0; -@@ -96,7 +95,7 @@ static int context_set_domain_id(struct - i = find_first_bit(iommu->domid_bitmap, nr_dom); - while ( i < nr_dom ) - { -- if ( iommu->domid_map[i] == d->domain_id ) -+ if ( iommu->domid_map[i] == domid ) - { - found = 1; - break; -@@ -112,7 +111,7 @@ static int context_set_domain_id(struct - dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n"); - return -EFAULT; - } -- iommu->domid_map[i] = d->domain_id; -+ iommu->domid_map[i] = domid; - } - - set_bit(i, iommu->domid_bitmap); -@@ -121,9 +120,9 @@ static int context_set_domain_id(struct - return 0; - } - --static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) -+static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu) - { -- int iommu_domid = domain_iommu_domid(domain, iommu); -+ int iommu_domid = get_iommu_did(domid, iommu, false); - - if ( iommu_domid >= 0 ) - { -@@ -179,7 +178,7 @@ static void check_cleanup_domid_map(stru - if ( !found ) - { - clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap); -- cleanup_domid_map(d, iommu); -+ cleanup_domid_map(d->domain_id, iommu); - } - } - -@@ -636,7 +635,7 @@ static int __must_check iommu_flush_iotl - continue; - - flush_dev_iotlb = !!find_ats_dev_drhd(iommu); -- iommu_domid= domain_iommu_domid(d, iommu); -+ iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying); - if ( iommu_domid == -1 ) - continue; - -@@ -1457,7 +1456,7 @@ int domain_context_mapping_one( - spin_unlock(&hd->arch.mapping_lock); - } - -- rc = context_set_domain_id(&lctxt, domain, iommu); -+ rc = context_set_domain_id(&lctxt, domid, iommu); - if ( rc ) - { - unlock: -@@ -1782,7 +1781,7 @@ int domain_context_unmap_one( - context_clear_entry(*context); - iommu_sync_cache(context, sizeof(struct context_entry)); - -- iommu_domid= domain_iommu_domid(domain, iommu); -+ iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying); - if ( iommu_domid == -1 ) - { - spin_unlock(&iommu->lock); -@@ -1954,7 +1953,7 @@ static void iommu_domain_teardown(struct - spin_unlock(&hd->arch.mapping_lock); - - for_each_drhd_unit ( drhd ) -- cleanup_domid_map(d, drhd->iommu); -+ 
cleanup_domid_map(d->domain_id, drhd->iommu); - } - - static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, diff --git a/xsa400-4.14-08.patch b/xsa400-4.14-08.patch deleted file mode 100644 index 509b880..0000000 --- a/xsa400-4.14-08.patch +++ /dev/null @@ -1,408 +0,0 @@ -From: Jan Beulich -Subject: IOMMU/x86: maintain a per-device pseudo domain ID - -In order to subsequently enable per-device quarantine page tables, we'll -need domain-ID-like identifiers to be inserted in the respective device -(AMD) or context (Intel) table entries alongside the per-device page -table root addresses. - -Make use of "real" domain IDs occupying only half of the value range -coverable by domid_t. - -Note that in VT-d's iommu_alloc() I didn't want to introduce new memory -leaks in case of error, but existing ones don't get plugged - that'll be -the subject of a later change. - -The VT-d changes are slightly asymmetric, but this way we can avoid -assigning pseudo domain IDs to devices which would never be mapped while -still avoiding to add a new parameter to domain_context_unmap(). - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Kevin Tian -Reviewed-by: Roger Pau Monné - ---- a/xen/include/asm-x86/iommu.h -+++ b/xen/include/asm-x86/iommu.h -@@ -130,6 +130,10 @@ int pi_update_irte(const struct pi_desc - iommu_vcall(ops, sync_cache, addr, size); \ - }) - -+unsigned long *iommu_init_domid(void); -+domid_t iommu_alloc_domid(unsigned long *map); -+void iommu_free_domid(domid_t domid, unsigned long *map); -+ - #endif /* !__ARCH_X86_IOMMU_H__ */ - /* - * Local variables: ---- a/xen/include/asm-x86/pci.h -+++ b/xen/include/asm-x86/pci.h -@@ -15,6 +15,12 @@ - - struct arch_pci_dev { - vmask_t used_vectors; -+ /* -+ * These fields are (de)initialized under pcidevs-lock. Other uses of -+ * them don't race (de)initialization and hence don't strictly need any -+ * locking. 
-+ */ -+ domid_t pseudo_domid; - }; - - int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -96,6 +96,7 @@ struct amd_iommu { - struct ring_buffer cmd_buffer; - struct ring_buffer event_log; - struct ring_buffer ppr_log; -+ unsigned long *domid_map; - - int exclusion_enable; - int exclusion_allow_all; ---- a/xen/drivers/passthrough/amd/iommu_detect.c -+++ b/xen/drivers/passthrough/amd/iommu_detect.c -@@ -180,6 +180,11 @@ int __init amd_iommu_detect_one_acpi( - if ( rt ) - goto out; - -+ iommu->domid_map = iommu_init_domid(); -+ rt = -ENOMEM; -+ if ( !iommu->domid_map ) -+ goto out; -+ - rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func)); - if ( rt ) - printk(XENLOG_ERR -@@ -191,7 +196,10 @@ int __init amd_iommu_detect_one_acpi( - - out: - if ( rt ) -+ { -+ xfree(iommu->domid_map); - xfree(iommu); -+ } - - return rt; - } ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -554,6 +554,8 @@ static int amd_iommu_add_device(u8 devfn - struct amd_iommu *iommu; - u16 bdf; - struct ivrs_mappings *ivrs_mappings; -+ bool fresh_domid = false; -+ int ret; - - if ( !pdev->domain ) - return -EINVAL; -@@ -617,7 +619,22 @@ static int amd_iommu_add_device(u8 devfn - spin_unlock_irqrestore(&iommu->lock, flags); - } - -- return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); -+ if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID ) -+ { -+ pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map); -+ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) -+ return -ENOSPC; -+ fresh_domid = true; -+ } -+ -+ ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); -+ if ( ret && fresh_domid ) -+ { -+ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); -+ pdev->arch.pseudo_domid = DOMID_INVALID; -+ } -+ -+ return ret; - } - - static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) -@@ -642,6 +659,9 @@ static int amd_iommu_remove_device(u8 de - - amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); - -+ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); -+ pdev->arch.pseudo_domid = DOMID_INVALID; -+ - ivrs_mappings = get_ivrs_mappings(pdev->seg); - bdf = PCI_BDF2(pdev->bus, devfn); - if ( amd_iommu_perdev_intremap && ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -339,6 +339,7 @@ static struct pci_dev *alloc_pdev(struct - *((u8*) &pdev->bus) = bus; - *((u8*) &pdev->devfn) = devfn; - pdev->domain = NULL; -+ pdev->arch.pseudo_domid = DOMID_INVALID; - INIT_LIST_HEAD(&pdev->msi_list); - - pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), -@@ -1349,9 +1350,13 @@ static int _dump_pci_devices(struct pci_ - - list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) - { -- printk("%04x:%02x:%02x.%u - %pd - node %-3d - MSIs < ", -- pseg->nr, pdev->bus, -- PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain, -+ printk("%04x:%02x:%02x.%u - ", pseg->nr, pdev->bus, -+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); -+ if ( pdev->domain == dom_io ) -+ printk("DomIO:%x", pdev->arch.pseudo_domid); -+ else -+ printk("%pd", pdev->domain); -+ printk(" - node %-3d - MSIs < ", - (pdev->node != NUMA_NO_NODE) ? 
pdev->node : -1); - list_for_each_entry ( msi, &pdev->msi_list, list ) - printk("%d ", msi->irq); ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1195,7 +1196,7 @@ int __init iommu_alloc(struct acpi_drhd_ - { - struct vtd_iommu *iommu; - unsigned long sagaw, nr_dom; -- int agaw; -+ int agaw, rc; - - if ( nr_iommus >= MAX_IOMMUS ) - { -@@ -1288,7 +1289,16 @@ int __init iommu_alloc(struct acpi_drhd_ - if ( !iommu->domid_map ) - return -ENOMEM; - -+ iommu->pseudo_domid_map = iommu_init_domid(); -+ rc = -ENOMEM; -+ if ( !iommu->pseudo_domid_map ) -+ goto free; -+ - return 0; -+ -+ free: -+ iommu_free(drhd); -+ return rc; - } - - void __init iommu_free(struct acpi_drhd_unit *drhd) -@@ -1311,6 +1321,7 @@ void __init iommu_free(struct acpi_drhd_ - - xfree(iommu->domid_bitmap); - xfree(iommu->domid_map); -+ xfree(iommu->pseudo_domid_map); - - if ( iommu->msi.irq >= 0 ) - destroy_irq(iommu->msi.irq); -@@ -1591,8 +1602,8 @@ int domain_context_mapping_one( - return rc ?: pdev && prev_dom; - } - --static int domain_context_unmap(struct domain *d, uint8_t devfn, -- struct pci_dev *pdev); -+static const struct acpi_drhd_unit *domain_context_unmap( -+ struct domain *d, uint8_t devfn, struct pci_dev *pdev); - - static int domain_context_mapping(struct domain *domain, u8 devfn, - struct pci_dev *pdev) -@@ -1600,6 +1611,7 @@ static int domain_context_mapping(struct - struct acpi_drhd_unit *drhd; - const struct acpi_rmrr_unit *rmrr; - paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; -+ domid_t orig_domid = pdev->arch.pseudo_domid; - int ret = 0; - unsigned int i, mode = 0; - uint16_t seg = pdev->seg, bdf; -@@ -1660,6 +1672,14 @@ static int domain_context_mapping(struct - break; - - case DEV_TYPE_PCIe_ENDPOINT: -+ if ( iommu_quarantine && orig_domid == DOMID_INVALID ) -+ { -+ pdev->arch.pseudo_domid = -+ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); -+ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) -+ return -ENOSPC; -+ } -+ - if ( iommu_debug ) - printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", - domain->domain_id, seg, bus, -@@ -1675,6 +1695,14 @@ static int domain_context_mapping(struct - break; - - case DEV_TYPE_PCI: -+ if ( iommu_quarantine && orig_domid == DOMID_INVALID ) -+ { -+ pdev->arch.pseudo_domid = -+ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); -+ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) -+ return -ENOSPC; -+ } -+ - if ( iommu_debug ) - printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n", - domain->domain_id, seg, bus, -@@ -1750,6 +1778,13 @@ static int domain_context_mapping(struct - if ( !ret && devfn == pdev->devfn ) - pci_vtd_quirk(pdev); - -+ if ( ret && drhd && orig_domid == DOMID_INVALID ) -+ { -+ iommu_free_domid(pdev->arch.pseudo_domid, -+ drhd->iommu->pseudo_domid_map); -+ pdev->arch.pseudo_domid = DOMID_INVALID; -+ } -+ - return ret; - } - -@@ -1832,8 +1867,10 @@ int domain_context_unmap_one( - return rc; - } - --static int domain_context_unmap(struct domain *domain, u8 devfn, -- struct pci_dev *pdev) -+static const struct acpi_drhd_unit *domain_context_unmap( -+ struct domain *domain, -+ uint8_t devfn, -+ struct pci_dev *pdev) - { - struct acpi_drhd_unit *drhd; - struct vtd_iommu *iommu; -@@ -1843,7 +1880,7 @@ static int domain_context_unmap(struct d - - drhd = acpi_find_matched_drhd_unit(pdev); - if ( !drhd ) -- return -ENODEV; -+ return ERR_PTR(-ENODEV); - iommu = drhd->iommu; - - switch ( pdev->type ) -@@ -1854,7 
+1891,7 @@ static int domain_context_unmap(struct d - domain->domain_id, seg, bus, - PCI_SLOT(devfn), PCI_FUNC(devfn)); - if ( !is_hardware_domain(domain) ) -- return -EPERM; -+ return ERR_PTR(-EPERM); - goto out; - - case DEV_TYPE_PCIe_BRIDGE: -@@ -1929,7 +1966,7 @@ static int domain_context_unmap(struct d - check_cleanup_domid_map(domain, pdev, iommu); - - out: -- return ret; -+ return ret ? ERR_PTR(ret) : drhd; - } - - static void iommu_domain_teardown(struct domain *d) -@@ -2153,16 +2190,17 @@ static int intel_iommu_enable_device(str - - static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) - { -+ const struct acpi_drhd_unit *drhd; - struct acpi_rmrr_unit *rmrr; - u16 bdf; -- int ret, i; -+ unsigned int i; - - if ( !pdev->domain ) - return -EINVAL; - -- ret = domain_context_unmap(pdev->domain, devfn, pdev); -- if ( ret ) -- return ret; -+ drhd = domain_context_unmap(pdev->domain, devfn, pdev); -+ if ( IS_ERR(drhd) ) -+ return PTR_ERR(drhd); - - for_each_rmrr_device ( rmrr, bdf, i ) - { -@@ -2179,6 +2217,13 @@ static int intel_iommu_remove_device(u8 - rmrr->end_address, 0); - } - -+ if ( drhd ) -+ { -+ iommu_free_domid(pdev->arch.pseudo_domid, -+ drhd->iommu->pseudo_domid_map); -+ pdev->arch.pseudo_domid = DOMID_INVALID; -+ } -+ - return 0; - } - ---- a/xen/drivers/passthrough/vtd/iommu.h -+++ b/xen/drivers/passthrough/vtd/iommu.h -@@ -535,6 +535,7 @@ struct vtd_iommu { - } flush; - - struct list_head ats_devices; -+ unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */ - unsigned long *domid_bitmap; /* domain id bitmap */ - u16 *domid_map; /* domain id mapping array */ - uint32_t version; ---- a/xen/drivers/passthrough/x86/iommu.c -+++ b/xen/drivers/passthrough/x86/iommu.c -@@ -375,6 +375,53 @@ void __hwdom_init arch_iommu_hwdom_init( - return; - } - -+unsigned long *__init iommu_init_domid(void) -+{ -+ if ( !iommu_quarantine ) -+ return ZERO_BLOCK_PTR; -+ -+ BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX); -+ -+ return xzalloc_array(unsigned long, -+ BITS_TO_LONGS(UINT16_MAX - DOMID_MASK)); -+} -+ -+domid_t iommu_alloc_domid(unsigned long *map) -+{ -+ /* -+ * This is used uniformly across all IOMMUs, such that on typical -+ * systems we wouldn't re-use the same ID very quickly (perhaps never). -+ */ -+ static unsigned int start; -+ unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start); -+ -+ ASSERT(pcidevs_locked()); -+ -+ if ( idx >= UINT16_MAX - DOMID_MASK ) -+ idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK); -+ if ( idx >= UINT16_MAX - DOMID_MASK ) -+ return DOMID_INVALID; -+ -+ __set_bit(idx, map); -+ -+ start = idx + 1; -+ -+ return idx | (DOMID_MASK + 1); -+} -+ -+void iommu_free_domid(domid_t domid, unsigned long *map) -+{ -+ ASSERT(pcidevs_locked()); -+ -+ if ( domid == DOMID_INVALID ) -+ return; -+ -+ ASSERT(domid > DOMID_MASK); -+ -+ if ( !__test_and_clear_bit(domid & DOMID_MASK, map) ) -+ BUG(); -+} -+ - /* - * Local variables: - * mode: C diff --git a/xsa400-4.14-09.patch b/xsa400-4.14-09.patch deleted file mode 100644 index 0423f67..0000000 --- a/xsa400-4.14-09.patch +++ /dev/null @@ -1,48 +0,0 @@ -From: Jan Beulich -Subject: IOMMU/x86: drop TLB flushes from quarantine_init() hooks - -The page tables just created aren't hooked up yet anywhere, so there's -nothing that could be present in any TLB, and hence nothing to flush. 
-Dropping this flush is, at least on the VT-d side, a prereq to per- -device domain ID use when quarantining devices, as dom_io isn't going -to be assigned a DID anymore: The warning in get_iommu_did() would -trigger. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Roger Pau Monné -Reviewed-by: Kevin Tian - ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -585,8 +585,6 @@ int __init amd_iommu_quarantine_init(str - out: - spin_unlock(&hd->arch.mapping_lock); - -- amd_iommu_flush_all_pages(d); -- - /* Pages leaked in failure case */ - return level ? -ENOMEM : 0; - } ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -2914,7 +2914,6 @@ static int __init intel_iommu_quarantine - struct dma_pte *parent; - unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); - unsigned int level = agaw_to_level(agaw); -- int rc; - - if ( hd->arch.pgd_maddr ) - { -@@ -2961,10 +2960,8 @@ static int __init intel_iommu_quarantine - out: - spin_unlock(&hd->arch.mapping_lock); - -- rc = iommu_flush_iotlb_all(d); -- - /* Pages leaked in failure case */ -- return level ? -ENOMEM : rc; -+ return level ? -ENOMEM : 0; - } - - const struct iommu_ops __initconstrel intel_iommu_ops = { diff --git a/xsa400-4.14-10.patch b/xsa400-4.14-10.patch deleted file mode 100644 index afbc3e8..0000000 --- a/xsa400-4.14-10.patch +++ /dev/null @@ -1,40 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: abstract maximum number of page table levels - -We will want to use the constant elsewhere. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -362,7 +362,7 @@ static inline int amd_iommu_get_paging_m - while ( max_frames > PTE_PER_TABLE_SIZE ) - { - max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT; -- if ( ++level > 6 ) -+ if ( ++level > IOMMU_MAX_PT_LEVELS ) - return -ENOMEM; - } - ---- a/xen/drivers/passthrough/amd/iommu-defs.h -+++ b/xen/drivers/passthrough/amd/iommu-defs.h -@@ -106,6 +106,7 @@ struct amd_iommu_dte { - bool tv:1; - unsigned int :5; - unsigned int had:2; -+#define IOMMU_MAX_PT_LEVELS 6 - unsigned int paging_mode:3; - uint64_t pt_root:40; - bool ppr:1; ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -256,7 +256,7 @@ static int iommu_pde_from_dfn(struct dom - table = hd->arch.root_table; - level = hd->arch.paging_mode; - -- BUG_ON( table == NULL || level < 1 || level > 6 ); -+ BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS ); - - /* - * A frame number past what the current page tables can represent can't diff --git a/xsa400-4.14-11.patch b/xsa400-4.14-11.patch deleted file mode 100644 index e0a8181..0000000 --- a/xsa400-4.14-11.patch +++ /dev/null @@ -1,879 +0,0 @@ -From: Jan Beulich -Subject: IOMMU/x86: use per-device page tables for quarantining - -Devices with RMRRs / unity mapped regions, due to it being unspecified -how/when these memory regions may be accessed, may not be left -disconnected from the mappings of these regions (as long as it's not -certain that the device has been fully quiesced). Hence even the page -tables used when quarantining such devices need to have mappings of -those regions. This implies installing page tables in the first place -even when not in scratch-page quarantining mode. - -This is CVE-2022-26361 / part of XSA-400. 
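Illustration only, not part of the patch: the heart of the change is fill_qpt() further down, a recursive walk that makes every slot at every paging level present, sharing one table per level and funneling all leaf slots into a single read/write scratch page, so DMA from a not-yet-quiesced device is absorbed rather than faulting. A stripped-down standalone sketch of that fill pattern, under stated assumptions: hypothetical names, plain host allocations in place of Xen's page-table allocator, the permission bits collapsed into one PRESENT flag, and an initially empty tree (the real code also descends into entries already installed for unity maps):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define SLOTS   512u    /* eight-byte entries per 4k table page */
    #define PRESENT 1ull    /* stand-in for the present + r/w bits */

    /* One shared table per level; pgs[0] doubles as the scratch leaf. */
    static int fill(uint64_t *this, unsigned int level, void *pgs[])
    {
        for ( unsigned int i = 0; i < SLOTS; ++i )
        {
            if ( this[i] & PRESENT )
                continue;
            if ( !pgs[level] )
            {
                if ( posix_memalign(&pgs[level], 4096, 4096) )
                    return -1;          /* -ENOMEM in the real code */
                memset(pgs[level], 0, 4096);
                /* Populate the new table before linking it in. */
                if ( level && fill(pgs[level], level - 1, pgs) )
                    return -1;
            }
            /* The real code stores an MFN; a raw pointer stands in. */
            this[i] = (uintptr_t)pgs[level] | PRESENT;
        }
        return 0;
    }

A four-level tree would be built as fill(root, 3, pgs) with pgs[] zero-initialised, mirroring the shape of the patch's fill_qpt(root, level - 1, pgs, pdev) call.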
- -While for the purpose here it would be sufficient to have devices with -RMRRs / unity mapped regions use per-device page tables, extend this to -all devices (in scratch-page quarantining mode). This allows the leaf -pages to be mapped r/w, thus covering also memory writes (rather than -just reads) issued by non-quiescent devices. - -Set up quarantine page tables as late as possible, yet early enough to -not encounter failure during de-assign. This means setup generally -happens in assign_device(), while (for now) the one in deassign_device() -is there mainly to be on the safe side. - -In VT-d's DID allocation function don't require the IOMMU lock to be -held anymore: All involved code paths hold pcidevs_lock, so this way we -avoid the need to acquire the IOMMU lock around the new call to -context_set_domain_id(). - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant -Reviewed-by: Kevin Tian -Reviewed-by: Roger Pau Monné - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -1455,7 +1455,7 @@ int set_identity_p2m_entry(struct domain - struct p2m_domain *p2m = p2m_get_hostp2m(d); - int ret; - -- if ( !paging_mode_translate(p2m->domain) ) -+ if ( !paging_mode_translate(d) ) - { - if ( !is_iommu_enabled(d) ) - return 0; ---- a/xen/include/asm-x86/pci.h -+++ b/xen/include/asm-x86/pci.h -@@ -1,6 +1,8 @@ - #ifndef __X86_PCI_H__ - #define __X86_PCI_H__ - -+#include -+ - #define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8) - #define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc) - #define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16) -@@ -20,7 +22,18 @@ struct arch_pci_dev { - * them don't race (de)initialization and hence don't strictly need any - * locking. - */ -+ union { -+ /* Subset of struct arch_iommu's fields, to be used in dom_io. */ -+ struct { -+ uint64_t pgd_maddr; -+ } vtd; -+ struct { -+ struct page_info *root_table; -+ } amd; -+ }; - domid_t pseudo_domid; -+ mfn_t leaf_mfn; -+ struct page_list_head pgtables_list; - }; - - int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -223,7 +223,8 @@ int amd_iommu_init_late(void); - int amd_iommu_update_ivrs_mapping_acpi(void); - int iov_adjust_irq_affinities(void); - --int amd_iommu_quarantine_init(struct domain *d); -+int amd_iommu_quarantine_init(struct pci_dev *pdev); -+void amd_iommu_quarantine_teardown(struct pci_dev *pdev); - - /* mapping functions */ - int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn, ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -529,64 +529,137 @@ int amd_iommu_reserve_domain_unity_unmap - return rc; - } - --int __init amd_iommu_quarantine_init(struct domain *d) -+static int fill_qpt(union amd_iommu_pte *this, unsigned int level, -+ struct page_info *pgs[IOMMU_MAX_PT_LEVELS], -+ struct pci_dev *pdev) - { -- struct domain_iommu *hd = dom_iommu(d); -+ unsigned int i; -+ int rc = 0; -+ -+ for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i ) -+ { -+ union amd_iommu_pte *pte = &this[i], *next; -+ -+ if ( !pte->pr ) -+ { -+ if ( !pgs[level] ) -+ { -+ /* -+ * The pgtable allocator is fine for the leaf page, as well as -+ * page table pages, and the resulting allocations are always -+ * zeroed. 
-+ */ -+ pgs[level] = alloc_amd_iommu_pgtable(); -+ if ( !pgs[level] ) -+ { -+ rc = -ENOMEM; -+ break; -+ } -+ -+ page_list_add(pgs[level], &pdev->arch.pgtables_list); -+ -+ if ( level ) -+ { -+ next = __map_domain_page(pgs[level]); -+ rc = fill_qpt(next, level - 1, pgs, pdev); -+ unmap_domain_page(next); -+ } -+ } -+ -+ /* -+ * PDEs are essentially a subset of PTEs, so this function -+ * is fine to use even at the leaf. -+ */ -+ set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level, -+ true, true); -+ } -+ else if ( level && pte->next_level ) -+ { -+ page_list_add(mfn_to_page(_mfn(pte->mfn)), -+ &pdev->arch.pgtables_list); -+ next = map_domain_page(_mfn(pte->mfn)); -+ rc = fill_qpt(next, level - 1, pgs, pdev); -+ unmap_domain_page(next); -+ } -+ } -+ -+ return rc; -+} -+ -+int amd_iommu_quarantine_init(struct pci_dev *pdev) -+{ -+ struct domain_iommu *hd = dom_iommu(dom_io); - unsigned long end_gfn = - 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); - unsigned int level = amd_iommu_get_paging_mode(end_gfn); -- union amd_iommu_pte *table; -+ unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf); -+ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); -+ int rc; - -- if ( hd->arch.root_table ) -+ ASSERT(pcidevs_locked()); -+ ASSERT(!hd->arch.root_table); -+ -+ ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID); -+ -+ if ( pdev->arch.amd.root_table ) - { -- ASSERT_UNREACHABLE(); -+ clear_domain_page(pdev->arch.leaf_mfn); - return 0; - } - -- spin_lock(&hd->arch.mapping_lock); -- -- hd->arch.root_table = alloc_amd_iommu_pgtable(); -- if ( !hd->arch.root_table ) -- goto out; -- -- table = __map_domain_page(hd->arch.root_table); -- while ( level ) -+ pdev->arch.amd.root_table = alloc_amd_iommu_pgtable(); -+ if ( !pdev->arch.amd.root_table ) -+ return -ENOMEM; -+ -+ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */ -+ hd->arch.root_table = pdev->arch.amd.root_table; -+ -+ rc = amd_iommu_reserve_domain_unity_map(dom_io, -+ ivrs_mappings[req_id].unity_map, -+ 0); -+ -+ iommu_identity_map_teardown(dom_io); -+ hd->arch.root_table = NULL; -+ -+ if ( rc ) -+ printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n", -+ pdev->seg, pdev->bus, -+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); -+ else - { -- struct page_info *pg; -- unsigned int i; -+ union amd_iommu_pte *root; -+ struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {}; - -- /* -- * The pgtable allocator is fine for the leaf page, as well as -- * page table pages, and the resulting allocations are always -- * zeroed. -- */ -- pg = alloc_amd_iommu_pgtable(); -- if ( !pg ) -- break; -+ spin_lock(&hd->arch.mapping_lock); - -- for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) -- { -- union amd_iommu_pte *pde = &table[i]; -+ root = __map_domain_page(pdev->arch.amd.root_table); -+ rc = fill_qpt(root, level - 1, pgs, pdev); -+ unmap_domain_page(root); - -- /* -- * PDEs are essentially a subset of PTEs, so this function -- * is fine to use even at the leaf. 
-+
-+int amd_iommu_quarantine_init(struct pci_dev *pdev)
-+{
-+ struct domain_iommu *hd = dom_iommu(dom_io);
- unsigned long end_gfn =
- 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
- unsigned int level = amd_iommu_get_paging_mode(end_gfn);
-- union amd_iommu_pte *table;
-+ unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
-+ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
-+ int rc;
-
-- if ( hd->arch.root_table )
-+ ASSERT(pcidevs_locked());
-+ ASSERT(!hd->arch.root_table);
-+
-+ ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
-+
-+ if ( pdev->arch.amd.root_table )
- {
-- ASSERT_UNREACHABLE();
-+ clear_domain_page(pdev->arch.leaf_mfn);
- return 0;
- }
-
-- spin_lock(&hd->arch.mapping_lock);
--
-- hd->arch.root_table = alloc_amd_iommu_pgtable();
-- if ( !hd->arch.root_table )
-- goto out;
--
-- table = __map_domain_page(hd->arch.root_table);
-- while ( level )
-+ pdev->arch.amd.root_table = alloc_amd_iommu_pgtable();
-+ if ( !pdev->arch.amd.root_table )
-+ return -ENOMEM;
-+
-+ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
-+ hd->arch.root_table = pdev->arch.amd.root_table;
-+
-+ rc = amd_iommu_reserve_domain_unity_map(dom_io,
-+ ivrs_mappings[req_id].unity_map,
-+ 0);
-+
-+ iommu_identity_map_teardown(dom_io);
-+ hd->arch.root_table = NULL;
-+
-+ if ( rc )
-+ printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n",
-+ pdev->seg, pdev->bus,
-+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
-+ else
- {
-- struct page_info *pg;
-- unsigned int i;
-+ union amd_iommu_pte *root;
-+ struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
-
-- /*
-- * The pgtable allocator is fine for the leaf page, as well as
-- * page table pages, and the resulting allocations are always
-- * zeroed.
-- */
-- pg = alloc_amd_iommu_pgtable();
-- if ( !pg )
-- break;
-+ spin_lock(&hd->arch.mapping_lock);
-
-- for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
-- {
-- union amd_iommu_pte *pde = &table[i];
-+ root = __map_domain_page(pdev->arch.amd.root_table);
-+ rc = fill_qpt(root, level - 1, pgs, pdev);
-+ unmap_domain_page(root);
-
-- /*
-- * PDEs are essentially a subset of PTEs, so this function
-- * is fine to use even at the leaf.
-- */
-- set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
-- false, true);
-- }
-+ pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
-
-- unmap_domain_page(table);
-- table = __map_domain_page(pg);
-- level--;
-+ spin_unlock(&hd->arch.mapping_lock);
- }
-- unmap_domain_page(table);
-
-- out:
-- spin_unlock(&hd->arch.mapping_lock);
-+ if ( rc )
-+ amd_iommu_quarantine_teardown(pdev);
-+
-+ return rc;
-+}
-+
-+void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
-+{
-+ struct page_info *pg;
-+
-+ ASSERT(pcidevs_locked());
-+
-+ if ( !pdev->arch.amd.root_table )
-+ return;
-+
-+ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
-+ free_amd_iommu_pgtable(pg);
-
-- /* Pages leaked in failure case */
-- return level ? -ENOMEM : 0;
-+ pdev->arch.amd.root_table = NULL;
- }
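[Editorial sketch, not part of the patch: why
amd_iommu_quarantine_teardown() above can stay so small. Every page that
went into the hierarchy was also chained onto the per-device list, so
teardown pops and frees list heads instead of walking the tables.
Invented names again.]

    #include <stdlib.h>

    struct pt_page { struct pt_page *next; };

    static void teardown(struct pt_page **list)
    {
        struct pt_page *pg;

        while ( (pg = *list) != NULL )
        {
            *list = pg->next;   /* unchain the head... */
            free(pg);           /* ...and free it; no tree walk needed */
        }
    }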
-
- /*
---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
-+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
-@@ -122,6 +122,8 @@ static int __must_check amd_iommu_setup_
- u8 bus = pdev->bus;
- struct domain_iommu *hd = dom_iommu(domain);
- const struct ivrs_mappings *ivrs_dev;
-+ const struct page_info *root_pg;
-+ domid_t domid;
-
- BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
-
-@@ -141,14 +143,25 @@ static int __must_check amd_iommu_setup_
- dte = &table[req_id];
- ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
-
-+ if ( domain != dom_io )
-+ {
-+ root_pg = hd->arch.root_table;
-+ domid = domain->domain_id;
-+ }
-+ else
-+ {
-+ root_pg = pdev->arch.amd.root_table;
-+ domid = pdev->arch.pseudo_domid;
-+ }
-+
- spin_lock_irqsave(&iommu->lock, flags);
-
- if ( !dte->v || !dte->tv )
- {
- /* bind DTE to domain page-tables */
- rc = amd_iommu_set_root_page_table(
-- dte, page_to_maddr(hd->arch.root_table),
-- domain->domain_id, hd->arch.paging_mode, sr_flags);
-+ dte, page_to_maddr(root_pg), domid,
-+ hd->arch.paging_mode, sr_flags);
- if ( rc )
- {
- ASSERT(rc < 0);
-@@ -172,7 +185,7 @@ static int __must_check amd_iommu_setup_
-
- amd_iommu_flush_device(iommu, req_id);
- }
-- else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) )
-+ else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
- {
- /*
- * Strictly speaking if the device is the only one with this requestor
-@@ -185,8 +198,8 @@ static int __must_check amd_iommu_setup_
- rc = -EOPNOTSUPP;
- else
- rc = amd_iommu_set_root_page_table(
-- dte, page_to_maddr(hd->arch.root_table),
-- domain->domain_id, hd->arch.paging_mode, sr_flags);
-+ dte, page_to_maddr(root_pg), domid,
-+ hd->arch.paging_mode, sr_flags);
- if ( rc < 0 )
- {
- spin_unlock_irqrestore(&iommu->lock, flags);
-@@ -205,6 +218,7 @@ static int __must_check amd_iommu_setup_
- * intended anyway.
- */
- !pdev->domain->is_dying &&
-+ pdev->domain != dom_io &&
- (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
- pdev->phantom_stride) )
- printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
-@@ -235,9 +249,8 @@ static int __must_check amd_iommu_setup_
- AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
- "root table = %#"PRIx64", "
- "domain = %d, paging mode = %d\n",
-- req_id, pdev->type,
-- page_to_maddr(hd->arch.root_table),
-- domain->domain_id, hd->arch.paging_mode);
-+ req_id, pdev->type, page_to_maddr(root_pg),
-+ domid, hd->arch.paging_mode);
-
- ASSERT(pcidevs_locked());
-
-@@ -304,7 +317,7 @@ static int iov_enable_xt(void)
-
- int amd_iommu_alloc_root(struct domain_iommu *hd)
- {
-- if ( unlikely(!hd->arch.root_table) )
-+ if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) )
- {
- hd->arch.root_table = alloc_amd_iommu_pgtable();
- if ( !hd->arch.root_table )
-@@ -395,7 +408,7 @@ static void amd_iommu_disable_domain_dev
-
- AMD_IOMMU_DEBUG("Disable: device id = %#x, "
- "domain = %d, paging mode = %d\n",
-- req_id, domain->domain_id,
-+ req_id, dte->domain_id,
- dom_iommu(domain)->arch.paging_mode);
- }
- spin_unlock_irqrestore(&iommu->lock, flags);
-@@ -659,6 +672,8 @@ static int amd_iommu_remove_device(u8 de
-
- amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
-
-+ amd_iommu_quarantine_teardown(pdev);
-+
- iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
- pdev->arch.pseudo_domid = DOMID_INVALID;
-
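[Editorial sketch, not part of the patch: the selection the
amd_iommu_setup_domain_device() hunks above introduce, as a stand-alone
function with stand-in types. Everything but dom_io keeps the
domain-wide root table and domain ID; a quarantined device gets its
private root and its pseudo-domid written into the DTE instead.]

    struct dte_params { unsigned long root_maddr; unsigned short domid; };

    static struct dte_params pick_root(int is_dom_io,
                                       unsigned long domain_root,
                                       unsigned short domain_id,
                                       unsigned long dev_root,
                                       unsigned short pseudo_domid)
    {
        struct dte_params p;

        if ( !is_dom_io )
        {
            p.root_maddr = domain_root;   /* shared per-domain tables */
            p.domid = domain_id;
        }
        else
        {
            p.root_maddr = dev_root;      /* private per-device tables */
            p.domid = pseudo_domid;
        }

        return p;
    }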
---- a/xen/drivers/passthrough/iommu.c
-+++ b/xen/drivers/passthrough/iommu.c
-@@ -447,21 +447,21 @@ int iommu_iotlb_flush_all(struct domain
- return rc;
- }
-
---static int __init iommu_quarantine_init(void)
-+int iommu_quarantine_dev_init(device_t *dev)
- {
- const struct domain_iommu *hd = dom_iommu(dom_io);
-- int rc;
-
-- dom_io->options |= XEN_DOMCTL_CDF_iommu;
-+ if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
-+ return 0;
-
-- rc = iommu_domain_init(dom_io, 0);
-- if ( rc )
-- return rc;
-+ return iommu_call(hd->platform_ops, quarantine_init, dev);
-+}
-
-- if ( !hd->platform_ops->quarantine_init )
-- return 0;
-+static int __init iommu_quarantine_init(void)
-+{
-+ dom_io->options |= XEN_DOMCTL_CDF_iommu;
-
-- return hd->platform_ops->quarantine_init(dom_io);
-+ return iommu_domain_init(dom_io, 0);
- }
-
- int __init iommu_setup(void)
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -929,9 +929,16 @@ static int deassign_device(struct domain
- return -ENODEV;
-
- /* De-assignment from dom_io should de-quarantine the device */
-- target = ((pdev->quarantine || iommu_quarantine) &&
-- pdev->domain != dom_io) ?
-- dom_io : hardware_domain;
-+ if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
-+ {
-+ ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
-+ if ( ret )
-+ return ret;
-+
-+ target = dom_io;
-+ }
-+ else
-+ target = hardware_domain;
-
- while ( pdev->phantom_stride )
- {
-@@ -1528,6 +1535,13 @@ static int assign_device(struct domain *
- msixtbl_init(d);
- }
-
-+ if ( pdev->domain != dom_io )
-+ {
-+ rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
-+ if ( rc )
-+ goto done;
-+ }
-+
- pdev->fault.count = 0;
-
- if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
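[Editorial sketch, not part of the patch: the ordering the
deassign_device() hunk above relies on, with stand-in helpers. The
per-device quarantine tables are set up before the device is re-targeted
at dom_io, so a failure (e.g. -ENOMEM) leaves the device untouched in
its current domain rather than quarantined behind empty tables.]

    static int setup_quarantine_tables(void) { return 0; }  /* stub */

    static int choose_target(int quarantining, int already_in_dom_io,
                             int *target_is_dom_io)
    {
        if ( quarantining && !already_in_dom_io )
        {
            int rc = setup_quarantine_tables();

            if ( rc )
                return rc;          /* device stays where it is */

            *target_is_dom_io = 1;  /* only now commit to dom_io */
        }
        else
            *target_is_dom_io = 0;  /* de-quarantine: hardware domain */

        return 0;
    }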
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -43,6 +43,12 @@
- #include "vtd.h"
- #include "../ats.h"
-
-+#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
-+ : (pdev)->arch.pseudo_domid)
-+#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
-+ ? dom_iommu(d)->arch.pgd_maddr \
-+ : (pdev)->arch.vtd.pgd_maddr)
-+
- /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
- bool __read_mostly untrusted_msi;
-
-@@ -84,13 +90,18 @@ static int get_iommu_did(domid_t domid,
-
- #define DID_FIELD_WIDTH 16
- #define DID_HIGH_OFFSET 8
-+
-+/*
-+ * This function may have "context" passed as NULL, to merely obtain a DID
-+ * for "domid".
-+ */
- static int context_set_domain_id(struct context_entry *context,
- domid_t domid, struct vtd_iommu *iommu)
- {
- unsigned long nr_dom, i;
- int found = 0;
-
-- ASSERT(spin_is_locked(&iommu->lock));
-+ ASSERT(pcidevs_locked());
-
- nr_dom = cap_ndoms(iommu->cap);
- i = find_first_bit(iommu->domid_bitmap, nr_dom);
-@@ -116,8 +127,13 @@ static int context_set_domain_id(struct
- }
-
- set_bit(i, iommu->domid_bitmap);
-- context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-- context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
-+
-+ if ( context )
-+ {
-+ context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-+ context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
-+ }
-+
- return 0;
- }
-
-@@ -167,8 +183,12 @@ static void check_cleanup_domid_map(stru
- const struct pci_dev *exclude,
- struct vtd_iommu *iommu)
- {
-- bool found = any_pdev_behind_iommu(d, exclude, iommu);
-+ bool found;
-+
-+ if ( d == dom_io )
-+ return;
-
-+ found = any_pdev_behind_iommu(d, exclude, iommu);
- /*
- * Hidden devices are associated with DomXEN but usable by the hardware
- * domain. Hence they need considering here as well.
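[Editorial sketch, not part of the patch: a much-simplified stand-in for
what context_set_domain_id() does now that it may be handed a NULL
context. It scans for a hardware DID already bound to the given software
domid (or a free one), and only touches the context entry when one was
supplied, so a caller can merely reserve a DID, as the VT-d quarantine
code further down does.]

    #include <stdint.h>

    #define NR_DIDS 256

    static uint16_t did_to_domid[NR_DIDS];
    static unsigned char did_used[NR_DIDS];

    static int get_did(uint16_t domid, uint64_t *ctx_hi /* may be NULL */)
    {
        unsigned int i, free_slot = NR_DIDS;

        for ( i = 0; i < NR_DIDS; i++ )
        {
            if ( did_used[i] && did_to_domid[i] == domid )
                break;                      /* reuse the existing DID */
            if ( !did_used[i] && free_slot == NR_DIDS )
                free_slot = i;
        }

        if ( i == NR_DIDS )
        {
            if ( free_slot == NR_DIDS )
                return -1;                  /* out of DIDs */
            i = free_slot;
            did_used[i] = 1;
            did_to_domid[i] = domid;
        }

        if ( ctx_hi )                       /* NULL: merely reserve */
            *ctx_hi = (*ctx_hi & ~(0xffffULL << 8)) | ((uint64_t)i << 8);

        return 0;
    }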
-@@ -1403,7 +1423,7 @@ int domain_context_mapping_one(
- domid = iommu->domid_map[prev_did];
- if ( domid < DOMID_FIRST_RESERVED )
- prev_dom = rcu_lock_domain_by_id(domid);
-- else if ( domid == DOMID_IO )
-+ else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
- prev_dom = rcu_lock_domain(dom_io);
- if ( !prev_dom )
- {
-@@ -1580,15 +1600,12 @@ int domain_context_mapping_one(
- {
- if ( !prev_dom )
- ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-- domain->domain_id);
-+ DEVICE_DOMID(domain, pdev));
- else if ( prev_dom != domain ) /* Avoid infinite recursion. */
-- {
-- hd = dom_iommu(prev_dom);
- ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
-- domain->domain_id,
-- hd->arch.pgd_maddr,
-+ DEVICE_DOMID(prev_dom, pdev),
-+ DEVICE_PGTABLE(prev_dom, pdev),
- mode & MAP_WITH_RMRR) < 0;
-- }
- else
- ret = 1;
-
-@@ -1610,7 +1627,7 @@ static int domain_context_mapping(struct
- {
- struct acpi_drhd_unit *drhd;
- const struct acpi_rmrr_unit *rmrr;
-- paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
-+ paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
- domid_t orig_domid = pdev->arch.pseudo_domid;
- int ret = 0;
- unsigned int i, mode = 0;
-@@ -1643,7 +1660,7 @@ static int domain_context_mapping(struct
- break;
- }
-
-- if ( domain != pdev->domain )
-+ if ( domain != pdev->domain && pdev->domain != dom_io )
- {
- if ( pdev->domain->is_dying )
- mode |= MAP_OWNER_DYING;
-@@ -1684,8 +1701,8 @@ static int domain_context_mapping(struct
- printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
- domain->domain_id, seg, bus,
- PCI_SLOT(devfn), PCI_FUNC(devfn));
-- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-- pdev, domain->domain_id, pgd_maddr,
-+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
-+ DEVICE_DOMID(domain, pdev), pgd_maddr,
- mode);
- if ( ret > 0 )
- ret = 0;
-@@ -1709,8 +1726,8 @@ static int domain_context_mapping(struct
- PCI_SLOT(devfn), PCI_FUNC(devfn));
-
- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-- pdev, domain->domain_id, pgd_maddr,
-- mode);
-+ pdev, DEVICE_DOMID(domain, pdev),
-+ pgd_maddr, mode);
- if ( ret < 0 )
- break;
- prev_present = ret;
-@@ -1738,8 +1755,8 @@ static int domain_context_mapping(struct
- */
- if ( ret >= 0 )
- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-- NULL, domain->domain_id, pgd_maddr,
-- mode);
-+ NULL, DEVICE_DOMID(domain, pdev),
-+ pgd_maddr, mode);
-
- /*
- * Devices behind PCIe-to-PCI/PCIx bridge may generate different
-@@ -1754,8 +1771,8 @@ static int domain_context_mapping(struct
- if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
- (secbus != pdev->bus || pdev->devfn != 0) )
- ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-- NULL, domain->domain_id, pgd_maddr,
-- mode);
-+ NULL, DEVICE_DOMID(domain, pdev),
-+ pgd_maddr, mode);
-
- if ( ret )
- {
-@@ -1905,7 +1922,7 @@ static const struct acpi_drhd_unit *doma
- domain->domain_id, seg, bus,
- PCI_SLOT(devfn), PCI_FUNC(devfn));
- ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-- domain->domain_id);
-+ DEVICE_DOMID(domain, pdev));
- if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
- disable_ats_device(pdev);
-
-@@ -1916,7 +1933,7 @@ static const struct acpi_drhd_unit *doma
- printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
- domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
- ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-- domain->domain_id);
-+ DEVICE_DOMID(domain, pdev));
- if ( ret )
- break;
-
-@@ -1939,18 +1956,12 @@ static const struct acpi_drhd_unit *doma
- break;
- }
-
-+ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-+ DEVICE_DOMID(domain, pdev));
- /* PCIe to PCI/PCIx bridge */
-- if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
-- {
-- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-- domain->domain_id);
-- if ( !ret )
-- ret = domain_context_unmap_one(domain, iommu, secbus, 0,
-- domain->domain_id);
-- }
-- else /* Legacy PCI bridge */
-- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-- domain->domain_id);
-+ if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
-+ ret = domain_context_unmap_one(domain, iommu, secbus, 0,
-+ DEVICE_DOMID(domain, pdev));
-
- break;
-
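[Editorial sketch, not part of the patch: what the DEVICE_DOMID() /
DEVICE_PGTABLE() rewrite of the call sites above buys. Every context
operation now picks the per-domain or per-device identifiers with one
expression instead of open-coding the dom_io special case; the types
below are stand-ins.]

    struct pdev_s { unsigned short pseudo_domid; unsigned long dev_pgd; };
    struct dom_s { int is_dom_io; unsigned short domain_id;
                   unsigned long pgd_maddr; };

    #define DEV_DOMID(d, p) (!(d)->is_dom_io ? (d)->domain_id \
                                             : (p)->pseudo_domid)
    #define DEV_PGTABLE(d, p) (!(d)->is_dom_io ? (d)->pgd_maddr \
                                               : (p)->dev_pgd)

    static unsigned long context_args(const struct dom_s *d,
                                      const struct pdev_s *p,
                                      unsigned short *domid)
    {
        *domid = DEV_DOMID(d, p);    /* pseudo-domid when quarantined */
        return DEV_PGTABLE(d, p);    /* private root when quarantined */
    }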
-@@ -1993,6 +2004,25 @@ static void iommu_domain_teardown(struct
- cleanup_domid_map(d->domain_id, drhd->iommu);
- }
-
-+static void quarantine_teardown(struct pci_dev *pdev,
-+ const struct acpi_drhd_unit *drhd)
-+{
-+ struct page_info *pg;
-+
-+ ASSERT(pcidevs_locked());
-+
-+ if ( !pdev->arch.vtd.pgd_maddr )
-+ return;
-+
-+ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
-+ free_domheap_page(pg);
-+
-+ pdev->arch.vtd.pgd_maddr = 0;
-+
-+ if ( drhd )
-+ cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
-+}
-+
- static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
- mfn_t mfn, unsigned int flags,
- unsigned int *flush_flags)
-@@ -2217,6 +2247,8 @@ static int intel_iommu_remove_device(u8
- rmrr->end_address, 0);
- }
-
-+ quarantine_teardown(pdev, drhd);
-+
- if ( drhd )
- {
- iommu_free_domid(pdev->arch.pseudo_domid,
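[Editorial sketch, not part of the patch, stand-in names: the teardown
order the AMD and VT-d remove_device hunks above both follow. The
device's translation entry is quiesced first, then the per-device
quarantine tables are freed, and only then is the pseudo domain ID
released, since the tables were tagged with it.]

    struct dev_state { int dummy; };

    static void disable_translation_entry(struct dev_state *d) { (void)d; }
    static void free_quarantine_pagetables(struct dev_state *d) { (void)d; }
    static void release_pseudo_domid(struct dev_state *d) { (void)d; }

    static void remove_device_cleanup(struct dev_state *dev)
    {
        disable_translation_entry(dev);   /* device stops using them   */
        free_quarantine_pagetables(dev);  /* pop the tracked page list */
        release_pseudo_domid(dev);        /* DID no longer referenced  */
    }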
-@@ -2908,60 +2940,139 @@ static void vtd_dump_p2m_table(struct do
- vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
- }
-
---static int __init intel_iommu_quarantine_init(struct domain *d)
-+static int fill_qpt(struct dma_pte *this, unsigned int level,
-+ paddr_t maddrs[6], struct pci_dev *pdev)
- {
-- struct domain_iommu *hd = dom_iommu(d);
-- struct dma_pte *parent;
-+ struct domain_iommu *hd = dom_iommu(dom_io);
-+ unsigned int i;
-+ int rc = 0;
-+
-+ for ( i = 0; !rc && i < PTE_NUM; ++i )
-+ {
-+ struct dma_pte *pte = &this[i], *next;
-+
-+ if ( !dma_pte_present(*pte) )
-+ {
-+ if ( !maddrs[level] )
-+ {
-+ /*
-+ * The pgtable allocator is fine for the leaf page, as well as
-+ * page table pages, and the resulting allocations are always
-+ * zeroed.
-+ */
-+ maddrs[level] = alloc_pgtable_maddr(1, hd->node);
-+ if ( !maddrs[level] )
-+ {
-+ rc = -ENOMEM;
-+ break;
-+ }
-+
-+ page_list_add(maddr_to_page(maddrs[level]),
-+ &pdev->arch.pgtables_list);
-+
-+ if ( level )
-+ {
-+ next = map_vtd_domain_page(maddrs[level]);
-+ rc = fill_qpt(next, level - 1, maddrs, pdev);
-+ unmap_vtd_domain_page(next);
-+ }
-+ }
-+
-+ dma_set_pte_addr(*pte, maddrs[level]);
-+ dma_set_pte_readable(*pte);
-+ dma_set_pte_writable(*pte);
-+ }
-+ else if ( level && !dma_pte_superpage(*pte) )
-+ {
-+ page_list_add(maddr_to_page(dma_pte_addr(*pte)),
-+ &pdev->arch.pgtables_list);
-+ next = map_vtd_domain_page(dma_pte_addr(*pte));
-+ rc = fill_qpt(next, level - 1, maddrs, pdev);
-+ unmap_vtd_domain_page(next);
-+ }
-+ }
-+
-+ return rc;
-+}
-+
-+static int intel_iommu_quarantine_init(struct pci_dev *pdev)
-+{
-+ struct domain_iommu *hd = dom_iommu(dom_io);
-+ paddr_t maddr;
- unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
- unsigned int level = agaw_to_level(agaw);
-+ const struct acpi_drhd_unit *drhd;
-+ const struct acpi_rmrr_unit *rmrr;
-+ unsigned int i, bdf;
-+ bool rmrr_found = false;
-+ int rc;
-
-- if ( hd->arch.pgd_maddr )
-+ ASSERT(pcidevs_locked());
-+ ASSERT(!hd->arch.pgd_maddr);
-+
-+ if ( pdev->arch.vtd.pgd_maddr )
- {
-- ASSERT_UNREACHABLE();
-+ clear_domain_page(pdev->arch.leaf_mfn);
- return 0;
- }
-
-- spin_lock(&hd->arch.mapping_lock);
-+ drhd = acpi_find_matched_drhd_unit(pdev);
-+ if ( !drhd )
-+ return -ENODEV;
-
-- hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
-- if ( !hd->arch.pgd_maddr )
-- goto out;
-+ maddr = alloc_pgtable_maddr(1, hd->node);
-+ if ( !maddr )
-+ return -ENOMEM;
-
-- parent = map_vtd_domain_page(hd->arch.pgd_maddr);
-- while ( level )
-- {
-- uint64_t maddr;
-- unsigned int offset;
-+ rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
-
-- /*
-- * The pgtable allocator is fine for the leaf page, as well as
-- * page table pages, and the resulting allocations are always
-- * zeroed.
-- */
-- maddr = alloc_pgtable_maddr(1, hd->node);
-- if ( !maddr )
-+ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
-+ hd->arch.pgd_maddr = maddr;
-+
-+ for_each_rmrr_device ( rmrr, bdf, i )
-+ {
-+ if ( rc )
- break;
-
-- for ( offset = 0; offset < PTE_NUM; offset++ )
-+ if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf )
- {
-- struct dma_pte *pte = &parent[offset];
-+ rmrr_found = true;
-
-- dma_set_pte_addr(*pte, maddr);
-- dma_set_pte_readable(*pte);
-+ rc = iommu_identity_mapping(dom_io, p2m_access_rw,
-+ rmrr->base_address, rmrr->end_address,
-+ 0);
-+ if ( rc )
-+ printk(XENLOG_ERR VTDPREFIX
-+ "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n",
-+ pdev->seg, pdev->bus,
-+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
- }
-- iommu_sync_cache(parent, PAGE_SIZE);
-+ }
-
-- unmap_vtd_domain_page(parent);
-- parent = map_vtd_domain_page(maddr);
-- level--;
-+ iommu_identity_map_teardown(dom_io);
-+ hd->arch.pgd_maddr = 0;
-+ pdev->arch.vtd.pgd_maddr = maddr;
-+
-+ if ( !rc )
-+ {
-+ struct dma_pte *root;
-+ paddr_t maddrs[6] = {};
-+
-+ spin_lock(&hd->arch.mapping_lock);
-+
-+ root = map_vtd_domain_page(maddr);
-+ rc = fill_qpt(root, level - 1, maddrs, pdev);
-+ unmap_vtd_domain_page(root);
-+
-+ pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]);
-+
-+ spin_unlock(&hd->arch.mapping_lock);
- }
-- unmap_vtd_domain_page(parent);
-
-- out:
-- spin_unlock(&hd->arch.mapping_lock);
-+ if ( rc )
-+ quarantine_teardown(pdev, drhd);
-
-- /* Pages leaked in failure case */
-- return level ? -ENOMEM : 0;
-+ return rc;
- }
-
- const struct iommu_ops __initconstrel intel_iommu_ops = {
---- a/xen/drivers/passthrough/vtd/iommu.h
-+++ b/xen/drivers/passthrough/vtd/iommu.h
-@@ -509,7 +509,7 @@ struct vtd_iommu {
- u32 nr_pt_levels;
- u64 cap;
- u64 ecap;
-- spinlock_t lock; /* protect context, domain ids */
-+ spinlock_t lock; /* protect context */
- spinlock_t register_lock; /* protect iommu register handling */
- u64 root_maddr; /* root entry machine address */
- nodeid_t node;
---- a/xen/include/xen/iommu.h
-+++ b/xen/include/xen/iommu.h
-@@ -236,7 +236,7 @@ typedef int iommu_grdm_t(xen_pfn_t start
- struct iommu_ops {
- int (*init)(struct domain *d);
- void (*hwdom_init)(struct domain *d);
-- int (*quarantine_init)(struct domain *d);
-+ int (*quarantine_init)(device_t *dev);
- int (*add_device)(u8 devfn, device_t *dev);
- int (*enable_device)(device_t *dev);
- int (*remove_device)(u8 devfn, device_t *dev);
-@@ -356,6 +356,7 @@ int __must_check iommu_suspend(void);
- void iommu_resume(void);
- void iommu_crash_shutdown(void);
- int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
-+int iommu_quarantine_dev_init(device_t *dev);
-
- void iommu_share_p2m_table(struct domain *d);
-
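[Editorial sketch, not part of the patch: the overall flow of the new
intel_iommu_quarantine_init() above, reduced to stand-in helpers. A DID
is reserved for the pseudo domain up front, the fresh root is installed
into dom_io only transiently so the generic identity-mapping helpers can
populate the device's RMRRs into it, and the remaining slots are then
filled with the shared page-per-level scheme; a failure unwinds through
the same teardown used on device removal.]

    static int  reserve_pseudo_did(void)      { return 0; }  /* stubs */
    static void install_root_in_dom_io(void)  { }
    static int  map_rmrrs_rw(void)            { return 0; }
    static void detach_root_from_dom_io(void) { }
    static int  fill_quarantine_tables(void)  { return 0; }
    static void quarantine_teardown_dev(void) { }

    static int quarantine_init_flow(void)
    {
        int rc = reserve_pseudo_did();

        if ( rc )
            return rc;

        install_root_in_dom_io();      /* transient, for RMRR mapping  */
        rc = map_rmrrs_rw();
        detach_root_from_dom_io();     /* root now owned by the device */

        if ( !rc )
            rc = fill_quarantine_tables();

        if ( rc )
            quarantine_teardown_dev(); /* unwind partial setup */

        return rc;
    }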