From f458ebe460a5c29df70fb985d7e791f362c38ae0 Mon Sep 17 00:00:00 2001 From: Michael Young Date: Nov 03 2020 20:48:36 +0000 Subject: revised patch for XSA-286 (mitigating performance impact) --- diff --git a/xen.spec b/xen.spec index 34c318c..a1b2011 100644 --- a/xen.spec +++ b/xen.spec @@ -58,7 +58,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.14.0 -Release: 8%{?dist} +Release: 9%{?dist} License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -136,12 +136,8 @@ Patch67: xsa347-4.14-1.patch Patch68: xsa347-4.14-2.patch Patch69: xsa347-4.14-3.patch Patch70: xen.gcc11.fixes.patch -Patch71: xsa286-4.14-0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch -Patch72: xsa286-4.14-0002-x86-mm-check-page-types-in-do_page_walk.patch -Patch73: xsa286-4.14-0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch -Patch74: xsa286-4.14-0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch -Patch75: xsa286-4.14-0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch -Patch76: xsa286-4.14-0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch +Patch71: xsa286-4.14-0001-x86-pv-Drop-FLUSH_TLB_GLOBAL-in-do_mmu_update-for-XP.patch +Patch72: xsa286-4.14-0002-x86-pv-Flush-TLB-in-response-to-paging-structure-cha.patch %if %build_qemutrad @@ -369,10 +365,6 @@ manage Xen virtual machines. %patch70 -p1 %patch71 -p1 %patch72 -p1 -%patch73 -p1 -%patch74 -p1 -%patch75 -p1 -%patch76 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -966,7 +958,10 @@ fi %endif %changelog -* Fri Oct 30 2020 Jeff Law - 4.14.0-6 +* Tue Nov 03 2020 Michael Young - 4.14.0-9 +- revised patch for XSA-286 (mitigating performance impact) + +* Fri Oct 30 2020 Jeff Law - 4.14.0-8 - Work around gcc-11 stringop-overflow diagnostics as well * Wed Oct 28 2020 Michael Young - 4.14.0-7 @@ -975,8 +970,6 @@ fi - simplify grub scripts (patches from Thierry Vignaud ) - some fixes for gcc 11 -* Thu Oct 22 2020 Michael Young - 4.14.0-6.1 - * Tue Oct 20 2020 Michael Young - 4.14.0-6 - x86: Race condition in Xen mapping code [XSA-345, CVE-2020-27672] (#1891097) @@ -984,6 +977,7 @@ fi (#1891093) - unsafe AMD IOMMU page table updates [XSA-347, CVE-2020-27670] (#1891088) + * Tue Sep 22 2020 Michael Young - 4.14.0-5 - x86 pv: Crash when handling guest access to MSR_MISC_ENABLE [XSA-333, CVE-2020-25602] (#1881619) diff --git a/xsa286-4.14-0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch b/xsa286-4.14-0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch deleted file mode 100644 index 6459e1f..0000000 --- a/xsa286-4.14-0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch +++ /dev/null @@ -1,73 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk() - -The L3 one at least is going to be re-used by a subsequent patch, and -splitting the L4 one then as well seems only natural. - -This is part of XSA-286. 
- -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c -index 48fd60a876..c25eb01e41 100644 ---- a/xen/arch/x86/x86_64/mm.c -+++ b/xen/arch/x86/x86_64/mm.c -@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START; - - l2_pgentry_t *compat_idle_pg_table_l2; - --void *do_page_walk(struct vcpu *v, unsigned long addr) -+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) - { -- unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); -- l4_pgentry_t l4e, *l4t; -- l3_pgentry_t l3e, *l3t; -- l2_pgentry_t l2e, *l2t; -- l1_pgentry_t l1e, *l1t; -+ unsigned long mfn = pagetable_get_pfn(root); -+ l4_pgentry_t *l4t, l4e; - -- if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) -- return NULL; -+ if ( !is_canonical_address(addr) ) -+ return l4e_empty(); - - l4t = map_domain_page(_mfn(mfn)); - l4e = l4t[l4_table_offset(addr)]; - unmap_domain_page(l4t); -+ -+ return l4e; -+} -+ -+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) -+{ -+ l4_pgentry_t l4e = page_walk_get_l4e(root, addr); -+ l3_pgentry_t *l3t, l3e; -+ - if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) -- return NULL; -+ return l3e_empty(); - - l3t = map_l3t_from_l4e(l4e); - l3e = l3t[l3_table_offset(addr)]; - unmap_domain_page(l3t); -+ -+ return l3e; -+} -+ -+void *do_page_walk(struct vcpu *v, unsigned long addr) -+{ -+ l3_pgentry_t l3e; -+ l2_pgentry_t l2e, *l2t; -+ l1_pgentry_t l1e, *l1t; -+ unsigned long mfn; -+ -+ if ( !is_pv_vcpu(v) ) -+ return NULL; -+ -+ l3e = page_walk_get_l3e(v->arch.guest_table, addr); - mfn = l3e_get_pfn(l3e); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) - return NULL; diff --git a/xsa286-4.14-0001-x86-pv-Drop-FLUSH_TLB_GLOBAL-in-do_mmu_update-for-XP.patch b/xsa286-4.14-0001-x86-pv-Drop-FLUSH_TLB_GLOBAL-in-do_mmu_update-for-XP.patch new file mode 100644 index 0000000..5e9109d --- /dev/null +++ b/xsa286-4.14-0001-x86-pv-Drop-FLUSH_TLB_GLOBAL-in-do_mmu_update-for-XP.patch @@ -0,0 +1,58 @@ +From 941f69a428cd989144300519e548e346c681a1b3 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 22 Oct 2020 11:28:58 +0100 +Subject: [PATCH 1/2] x86/pv: Drop FLUSH_TLB_GLOBAL in do_mmu_update() for XPTI + +c/s 9d1d31ad9498 "x86: slightly reduce Meltdown band-aid overhead" removed the +use of Global TLB flushes on the Xen entry path, but added a FLUSH_TLB_GLOBAL +to the L4 path in do_mmu_update(). + +However, this was unnecessary. + +It is the guests responsibility to perform appropriate TLB flushing if the L4 +modification altered an established mapping in a flush-relevant way. In this +case, an MMUEXT_OP hypercall will follow. The case which Xen needs to cover +is when new mappings are created, and the resync on the exit-to-guest path +covers this correctly. + +There is a corner case with multiple vCPUs in hypercalls at the same time, +which 9d1d31ad9498 changed, and this patch changes back to its original XPTI +behaviour. + +Architecturally, established TLB entries can continue to be used until the +broadcast flush has completed. Therefore, even with concurrent hypercalls, +the guest cannot depend on older mappings not being used until an MMUEXT_OP +hypercall completes. Xen's implementation of guest-initiated flushes will +take correct effect on top of an in-progress hypercall, picking up new mapping +setting before the other vCPU's MMUEXT_OP completes. 
+ +Note: The correctness of this change is not impacted by whether XPTI uses +global mappings or not. Correctness there depends on the behaviour of Xen on +the entry/exit paths when switching two/from the XPTI "shadow" pagetables. + +This is (not really) XSA-286 (but necessary to simplify the logic). + +Fixes: 9d1d31ad9498 ("x86: slightly reduce Meltdown band-aid overhead") +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 055e1c3a3d95b1e753148369fbc4ba48782dd602) +--- + xen/arch/x86/mm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 3cb6fabdae..1caa2df0a5 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -4193,7 +4193,7 @@ long do_mmu_update( + + cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu)); + if ( !cpumask_empty(mask) ) +- flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL); ++ flush_mask(mask, FLUSH_ROOT_PGTBL); + } + + perfc_add(num_page_updates, i); +-- +2.20.1 + diff --git a/xsa286-4.14-0002-x86-mm-check-page-types-in-do_page_walk.patch b/xsa286-4.14-0002-x86-mm-check-page-types-in-do_page_walk.patch deleted file mode 100644 index 3a05a74..0000000 --- a/xsa286-4.14-0002-x86-mm-check-page-types-in-do_page_walk.patch +++ /dev/null @@ -1,170 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: check page types in do_page_walk() - -For page table entries read to be guaranteed valid, transiently locking -the pages and validating their types is necessary. Note that guest use -of linear page tables is intentionally not taken into account here, as -ordinary data (guest stacks) can't possibly live inside page tables. - -This is part of XSA-286. - -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c -index c25eb01e41..6305cf6033 100644 ---- a/xen/arch/x86/x86_64/mm.c -+++ b/xen/arch/x86/x86_64/mm.c -@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2; - - static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) - { -- unsigned long mfn = pagetable_get_pfn(root); -- l4_pgentry_t *l4t, l4e; -+ mfn_t mfn = pagetable_get_mfn(root); -+ /* current's root page table can't disappear under our feet. 
*/ -+ bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table)); -+ struct page_info *pg; -+ l4_pgentry_t l4e = l4e_empty(); - - if ( !is_canonical_address(addr) ) - return l4e_empty(); - -- l4t = map_domain_page(_mfn(mfn)); -- l4e = l4t[l4_table_offset(addr)]; -- unmap_domain_page(l4t); -+ pg = mfn_to_page(mfn); -+ if ( need_lock && !page_lock(pg) ) -+ return l4e_empty(); -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table ) -+ { -+ l4_pgentry_t *l4t = map_domain_page(mfn); -+ -+ l4e = l4t[l4_table_offset(addr)]; -+ unmap_domain_page(l4t); -+ } -+ -+ if ( need_lock ) -+ page_unlock(pg); - - return l4e; - } -@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) - static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) - { - l4_pgentry_t l4e = page_walk_get_l4e(root, addr); -- l3_pgentry_t *l3t, l3e; -+ mfn_t mfn = l4e_get_mfn(l4e); -+ struct page_info *pg; -+ l3_pgentry_t l3e = l3e_empty(); - - if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return l3e_empty(); - -- l3t = map_l3t_from_l4e(l4e); -- l3e = l3t[l3_table_offset(addr)]; -- unmap_domain_page(l3t); -+ pg = mfn_to_page(mfn); -+ if ( !page_lock(pg) ) -+ return l3e_empty(); -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table ) -+ { -+ l3_pgentry_t *l3t = map_domain_page(mfn); -+ -+ l3e = l3t[l3_table_offset(addr)]; -+ unmap_domain_page(l3t); -+ } -+ -+ page_unlock(pg); - - return l3e; - } -@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) - void *do_page_walk(struct vcpu *v, unsigned long addr) - { - l3_pgentry_t l3e; -- l2_pgentry_t l2e, *l2t; -- l1_pgentry_t l1e, *l1t; -- unsigned long mfn; -+ l2_pgentry_t l2e = l2e_empty(); -+ l1_pgentry_t l1e = l1e_empty(); -+ mfn_t mfn; -+ struct page_info *pg; - - if ( !is_pv_vcpu(v) ) - return NULL; - - l3e = page_walk_get_l3e(v->arch.guest_table, addr); -- mfn = l3e_get_pfn(l3e); -- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) -+ mfn = l3e_get_mfn(l3e); -+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) - return NULL; - if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) - { -- mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); -+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1))); - goto ret; - } - -- l2t = map_domain_page(_mfn(mfn)); -- l2e = l2t[l2_table_offset(addr)]; -- unmap_domain_page(l2t); -- mfn = l2e_get_pfn(l2e); -- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) -+ pg = mfn_to_page(mfn); -+ if ( !page_lock(pg) ) -+ return NULL; -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) -+ { -+ const l2_pgentry_t *l2t = map_domain_page(mfn); -+ -+ l2e = l2t[l2_table_offset(addr)]; -+ unmap_domain_page(l2t); -+ } -+ -+ page_unlock(pg); -+ -+ mfn = l2e_get_mfn(l2e); -+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) - return NULL; - if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) - { -- mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); -+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1))); - goto ret; - } - -- l1t = map_domain_page(_mfn(mfn)); -- l1e = l1t[l1_table_offset(addr)]; -- unmap_domain_page(l1t); -- mfn = l1e_get_pfn(l1e); -- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) -+ pg = mfn_to_page(mfn); -+ if ( !page_lock(pg) ) -+ return NULL; -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) -+ { -+ const l1_pgentry_t *l1t = 
map_domain_page(mfn); -+ -+ l1e = l1t[l1_table_offset(addr)]; -+ unmap_domain_page(l1t); -+ } -+ -+ page_unlock(pg); -+ -+ mfn = l1e_get_mfn(l1e); -+ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) - return NULL; - - ret: -- return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK); -+ return map_domain_page(mfn) + (addr & ~PAGE_MASK); - } - - /* diff --git a/xsa286-4.14-0002-x86-pv-Flush-TLB-in-response-to-paging-structure-cha.patch b/xsa286-4.14-0002-x86-pv-Flush-TLB-in-response-to-paging-structure-cha.patch new file mode 100644 index 0000000..22af1ca --- /dev/null +++ b/xsa286-4.14-0002-x86-pv-Flush-TLB-in-response-to-paging-structure-cha.patch @@ -0,0 +1,166 @@ +From 10bb63c203f42d931fa1fa7dbbae7ce1765cecf2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 19 Oct 2020 15:51:22 +0100 +Subject: [PATCH 2/2] x86/pv: Flush TLB in response to paging structure changes + +With MMU_UPDATE, a PV guest can make changes to higher level pagetables. This +is safe from Xen's point of view (as the update only affects guest mappings), +and the guest is required to flush (if necessary) after making updates. + +However, Xen's use of linear pagetables (UPDATE_VA_MAPPING, GNTTABOP_map, +writeable pagetables, etc.) is an implementation detail outside of the +API/ABI. + +Changes in the paging structure require invalidations in the linear pagetable +range for subsequent accesses into the linear pagetables to access non-stale +mappings. Xen must provide suitable flushing to prevent intermixed guest +actions from accidentally accessing/modifying the wrong pagetable. + +For all L2 and higher modifications, flush the TLB. PV guests cannot create +L2 or higher entries with the Global bit set, so no mappings established in +the linear range can be global. (This could in principle be an order 39 flush +starting at LINEAR_PT_VIRT_START, but no such mechanism exists in practice.) + +Express the necessary flushes as a set of booleans which accumulate across the +operation. Comment the flushing logic extensively. + +This is XSA-286. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 16a20963b3209788f2c0d3a3eebb7d92f03f5883) +--- + xen/arch/x86/mm.c | 69 ++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 59 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 1caa2df0a5..61cf6a7b9b 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3896,7 +3896,8 @@ long do_mmu_update( + struct vcpu *curr = current, *v = curr; + struct domain *d = v->domain, *pt_owner = d, *pg_owner; + mfn_t map_mfn = INVALID_MFN, mfn; +- bool sync_guest = false; ++ bool flush_linear_pt = false, flush_root_pt_local = false, ++ flush_root_pt_others = false; + uint32_t xsm_needed = 0; + uint32_t xsm_checked = 0; + int rc = put_old_guest_table(curr); +@@ -4046,6 +4047,8 @@ long do_mmu_update( + break; + rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); ++ if ( !rc ) ++ flush_linear_pt = true; + break; + + case PGT_l3_page_table: +@@ -4053,6 +4056,8 @@ long do_mmu_update( + break; + rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); ++ if ( !rc ) ++ flush_linear_pt = true; + break; + + case PGT_l4_page_table: +@@ -4060,6 +4065,8 @@ long do_mmu_update( + break; + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); ++ if ( !rc ) ++ flush_linear_pt = true; + if ( !rc && pt_owner->arch.pv.xpti ) + { + bool local_in_use = false; +@@ -4068,7 +4075,7 @@ long do_mmu_update( + mfn) ) + { + local_in_use = true; +- get_cpu_info()->root_pgt_changed = true; ++ flush_root_pt_local = true; + } + + /* +@@ -4080,7 +4087,7 @@ long do_mmu_update( + (1 + !!(page->u.inuse.type_info & PGT_pinned) + + mfn_eq(pagetable_get_mfn(curr->arch.guest_table_user), + mfn) + local_in_use) ) +- sync_guest = true; ++ flush_root_pt_others = true; + } + break; + +@@ -4182,19 +4189,61 @@ long do_mmu_update( + if ( va ) + unmap_domain_page(va); + +- if ( sync_guest ) ++ /* ++ * Perform required TLB maintenance. ++ * ++ * This logic currently depend on flush_linear_pt being a superset of the ++ * flush_root_pt_* conditions. ++ * ++ * pt_owner may not be current->domain. This may occur during ++ * construction of 32bit PV guests, or debugging of PV guests. The ++ * behaviour cannot be correct with domain unpaused. We therefore expect ++ * pt_owner->dirty_cpumask to be empty, but it is a waste of effort to ++ * explicitly check for, and exclude, this corner case. ++ * ++ * flush_linear_pt requires a FLUSH_TLB to all dirty CPUs. The flush must ++ * be performed now to maintain correct behaviour across a multicall. ++ * i.e. we cannot relax FLUSH_TLB to FLUSH_ROOT_PGTBL, given that the ++ * former is a side effect of the latter, because the resync (which is in ++ * the return-to-guest path) happens too late. ++ * ++ * flush_root_pt_* requires FLUSH_ROOT_PGTBL on either the local CPU ++ * (implies pt_owner == current->domain and current->processor set in ++ * pt_owner->dirty_cpumask), and/or all *other* dirty CPUs as there are ++ * references we can't account for locally. ++ */ ++ if ( flush_linear_pt /* || flush_root_pt_local || flush_root_pt_others */ ) + { ++ unsigned int cpu = smp_processor_id(); ++ cpumask_t *mask = pt_owner->dirty_cpumask; ++ + /* +- * Force other vCPU-s of the affected guest to pick up L4 entry +- * changes (if any). 
++ * Always handle local flushing separately (if applicable), to ++ * separate the flush invocations appropriately for scope of the two ++ * flush_root_pt_* variables. + */ +- unsigned int cpu = smp_processor_id(); +- cpumask_t *mask = per_cpu(scratch_cpumask, cpu); ++ if ( likely(cpumask_test_cpu(cpu, mask)) ) ++ { ++ mask = per_cpu(scratch_cpumask, cpu); + +- cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu)); ++ cpumask_copy(mask, pt_owner->dirty_cpumask); ++ __cpumask_clear_cpu(cpu, mask); ++ ++ flush_local(FLUSH_TLB | ++ (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0)); ++ } ++ else ++ /* Sanity check. flush_root_pt_local implies local cpu is dirty. */ ++ ASSERT(!flush_root_pt_local); ++ ++ /* Flush the remote dirty CPUs. Does not include the local CPU. */ + if ( !cpumask_empty(mask) ) +- flush_mask(mask, FLUSH_ROOT_PGTBL); ++ flush_mask(mask, FLUSH_TLB | ++ (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0)); + } ++ else ++ /* Sanity check. flush_root_pt_* implies flush_linear_pt. */ ++ ASSERT(!flush_root_pt_local && !flush_root_pt_others); + + perfc_add(num_page_updates, i); + +-- +2.20.1 + diff --git a/xsa286-4.14-0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch b/xsa286-4.14-0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch deleted file mode 100644 index 5a5c251..0000000 --- a/xsa286-4.14-0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch +++ /dev/null @@ -1,92 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: avoid using linear page tables in map_guest_l1e() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Replace the linear L2 table access by an actual page walk. - -This is part of XSA-286. - -Reported-by: Jann Horn -Signed-off-by: Jan Beulich -Signed-off-by: Roger Pau Monné -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c -index 5d4cd00941..7be098f5ef 100644 ---- a/xen/arch/x86/pv/mm.c -+++ b/xen/arch/x86/pv/mm.c -@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) - if ( unlikely(!__addr_ok(linear)) ) - return NULL; - -- /* Find this l1e and its enclosing l1mfn in the linear map. */ -- if ( __copy_from_user(&l2e, -- &__linear_l2_table[l2_linear_offset(linear)], -- sizeof(l2_pgentry_t)) ) -+ if ( unlikely(!(current->arch.flags & TF_kernel_mode)) ) -+ { -+ ASSERT_UNREACHABLE(); - return NULL; -+ } -+ -+ /* Find this l1e and its enclosing l1mfn. */ -+ l2e = page_walk_get_l2e(current->arch.guest_table, linear); - - /* Check flags that it will be safe to read the l1e. 
*/ - if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) -diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c -index 6305cf6033..71a8bfc024 100644 ---- a/xen/arch/x86/x86_64/mm.c -+++ b/xen/arch/x86/x86_64/mm.c -@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) - return l3e; - } - -+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) -+{ -+ l3_pgentry_t l3e = page_walk_get_l3e(root, addr); -+ mfn_t mfn = l3e_get_mfn(l3e); -+ struct page_info *pg; -+ l2_pgentry_t l2e = l2e_empty(); -+ -+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || -+ (l3e_get_flags(l3e) & _PAGE_PSE) ) -+ return l2e_empty(); -+ -+ pg = mfn_to_page(mfn); -+ if ( !page_lock(pg) ) -+ return l2e_empty(); -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) -+ { -+ l2_pgentry_t *l2t = map_domain_page(mfn); -+ -+ l2e = l2t[l2_table_offset(addr)]; -+ unmap_domain_page(l2t); -+ } -+ -+ page_unlock(pg); -+ -+ return l2e; -+} -+ - void *do_page_walk(struct vcpu *v, unsigned long addr) - { - l3_pgentry_t l3e; -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 7e74996053..12ea812381 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -579,7 +579,9 @@ void audit_domains(void); - void make_cr3(struct vcpu *v, mfn_t mfn); - void update_cr3(struct vcpu *v); - int vcpu_destroy_pagetables(struct vcpu *); -+ - void *do_page_walk(struct vcpu *v, unsigned long addr); -+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); - - /* Allocator functions for Xen pagetables. */ - void *alloc_xen_pagetable(void); diff --git a/xsa286-4.14-0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch b/xsa286-4.14-0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch deleted file mode 100644 index 9783f79..0000000 --- a/xsa286-4.14-0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch +++ /dev/null @@ -1,172 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e() - -First of all drop guest_get_eff_l1e() entirely - there's no actual user -of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around -its only call site. - -Then replace the linear L1 table access by an actual page walk. - -This is part of XSA-286. - -Reported-by: Jann Horn -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c -index 7be098f5ef..5e4081aecd 100644 ---- a/xen/arch/x86/pv/mm.c -+++ b/xen/arch/x86/pv/mm.c -@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) - } - - /* -- * Read the guest's l1e that maps this address, from the kernel-mode -- * page tables. -- */ --static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) --{ -- struct vcpu *curr = current; -- const bool user_mode = !(curr->arch.flags & TF_kernel_mode); -- l1_pgentry_t l1e; -- -- if ( user_mode ) -- toggle_guest_pt(curr); -- -- l1e = guest_get_eff_l1e(linear); -- -- if ( user_mode ) -- toggle_guest_pt(curr); -- -- return l1e; --} -- --/* - * Map a guest's LDT page (covering the byte at @offset from start of the LDT) - * into Xen's virtual range. Returns true if the mapping changed, false - * otherwise. 
-diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h -index a1bd473b29..43d33a1fd1 100644 ---- a/xen/arch/x86/pv/mm.h -+++ b/xen/arch/x86/pv/mm.h -@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn); - - int new_guest_cr3(mfn_t mfn); - --/* Read a PV guest's l1e that maps this linear address. */ --static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear) -+/* -+ * Read the guest's l1e that maps this address, from the kernel-mode -+ * page tables. -+ */ -+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) - { -- l1_pgentry_t l1e; -+ l1_pgentry_t l1e = l1e_empty(); - - ASSERT(!paging_mode_translate(current->domain)); - ASSERT(!paging_mode_external(current->domain)); - -- if ( unlikely(!__addr_ok(linear)) || -- __copy_from_user(&l1e, -- &__linear_l1_table[l1_linear_offset(linear)], -- sizeof(l1_pgentry_t)) ) -- l1e = l1e_empty(); -+ if ( likely(__addr_ok(linear)) ) -+ l1e = page_walk_get_l1e(current->arch.guest_table, linear); - - return l1e; - } -diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c -index 0eedb70002..ce31dd401d 100644 ---- a/xen/arch/x86/pv/ro-page-fault.c -+++ b/xen/arch/x86/pv/ro-page-fault.c -@@ -349,7 +349,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) - bool mmio_ro; - - /* Attempt to read the PTE that maps the VA being accessed. */ -- pte = guest_get_eff_l1e(addr); -+ pte = guest_get_eff_kern_l1e(addr); - - /* We are only looking for read-only mappings */ - if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) ) -diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c -index 71a8bfc024..9e87a55174 100644 ---- a/xen/arch/x86/x86_64/mm.c -+++ b/xen/arch/x86/x86_64/mm.c -@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) - return l2e; - } - -+/* -+ * For now no "set_accessed" parameter, as all callers want it set to true. -+ * For now also no "set_dirty" parameter, as all callers deal with r/o -+ * mappings, and we don't want to set the dirty bit there (conflicts with -+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o -+ * PTEs, the logic below tolerates the bit becoming set "behind our backs". -+ */ -+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr) -+{ -+ l2_pgentry_t l2e = page_walk_get_l2e(root, addr); -+ mfn_t mfn = l2e_get_mfn(l2e); -+ struct page_info *pg; -+ l1_pgentry_t l1e = l1e_empty(); -+ -+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || -+ (l2e_get_flags(l2e) & _PAGE_PSE) ) -+ return l1e_empty(); -+ -+ pg = mfn_to_page(mfn); -+ if ( !page_lock(pg) ) -+ return l1e_empty(); -+ -+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) -+ { -+ l1_pgentry_t *l1t = map_domain_page(mfn); -+ -+ l1e = l1t[l1_table_offset(addr)]; -+ -+ if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) == -+ _PAGE_PRESENT ) -+ { -+ l1_pgentry_t ol1e = l1e; -+ -+ l1e_add_flags(l1e, _PAGE_ACCESSED); -+ /* -+ * Best effort only; with the lock held the page shouldn't -+ * change anyway, except for the dirty bit to perhaps become set. 
-+ */ -+ while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]), -+ l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) != -+ l1e_get_intpte(ol1e) && -+ !(l1e_get_flags(l1e) & _PAGE_DIRTY) ) -+ { -+ l1e_add_flags(ol1e, _PAGE_DIRTY); -+ l1e_add_flags(l1e, _PAGE_DIRTY); -+ } -+ } -+ -+ unmap_domain_page(l1t); -+ } -+ -+ page_unlock(pg); -+ -+ return l1e; -+} -+ - void *do_page_walk(struct vcpu *v, unsigned long addr) - { - l3_pgentry_t l3e; -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 12ea812381..da1a6f5712 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -582,6 +582,7 @@ int vcpu_destroy_pagetables(struct vcpu *); - - void *do_page_walk(struct vcpu *v, unsigned long addr); - l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); -+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr); - - /* Allocator functions for Xen pagetables. */ - void *alloc_xen_pagetable(void); diff --git a/xsa286-4.14-0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch b/xsa286-4.14-0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch deleted file mode 100644 index ca38773..0000000 --- a/xsa286-4.14-0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch +++ /dev/null @@ -1,101 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: avoid using top level linear page tables in - {,un}map_domain_page() - -Move the page table recursion two levels down. This entails avoiding -to free the recursive mapping prematurely in free_perdomain_mappings(). - -This is part of XSA-286. - -Reported-by: Jann Horn -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c -index b03728e18e..ed6a2bf081 100644 ---- a/xen/arch/x86/domain_page.c -+++ b/xen/arch/x86/domain_page.c -@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v) - #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) - #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) - #define MAPCACHE_L1ENT(idx) \ -- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] -+ ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \ -+ ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx] - - void *map_domain_page(mfn_t mfn) - { -@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d) - { - struct mapcache_domain *dcache = &d->arch.pv.mapcache; - unsigned int bitmap_pages; -+ int rc; - - ASSERT(is_pv_domain(d)); - -@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d) - return 0; - #endif - -+ BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1)); - BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + -- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > -+ 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) + -+ (1U << L2_PAGETABLE_SHIFT) > - MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); - bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); - dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; -@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d) - - spin_lock_init(&dcache->lock); - -- return create_perdomain_mapping(d, (unsigned long)dcache->inuse, -- 2 * bitmap_pages + 1, -- NIL(l1_pgentry_t *), NULL); -+ rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, -+ 2 * bitmap_pages + 1, -+ NIL(l1_pgentry_t *), NULL); -+ if ( !rc ) -+ { -+ /* -+ * Install mapping of our L2 table into its own last slot, for easy -+ * access to the L1 entries via 
MAPCACHE_L1ENT(). -+ */ -+ l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg); -+ l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)]; -+ l2_pgentry_t *l2t = map_l2t_from_l3e(l3e); -+ -+ l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e); -+ unmap_domain_page(l2t); -+ unmap_domain_page(l3t); -+ } -+ -+ return rc; - } - - int mapcache_vcpu_init(struct vcpu *v) -@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr) - else - { - ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); -- pl1e = &__linear_l1_table[l1_linear_offset(va)]; -+ pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START)); - } - - return l1e_get_mfn(*pl1e); -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 82bc676553..582ea09725 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5953,6 +5953,10 @@ void free_perdomain_mappings(struct domain *d) - { - struct page_info *l1pg = l2e_get_page(l2tab[j]); - -+ /* mapcache_domain_init() installs a recursive entry. */ -+ if ( l1pg == l2pg ) -+ continue; -+ - if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) - { - l1_pgentry_t *l1tab = __map_domain_page(l1pg); diff --git a/xsa286-4.14-0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch b/xsa286-4.14-0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch deleted file mode 100644 index 69ad37c..0000000 --- a/xsa286-4.14-0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch +++ /dev/null @@ -1,106 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: restrict use of linear page tables to shadow mode code - -Other code does not require them to be set up anymore, so restrict when -to populate the respective L4 slot and reduce visibility of the -accessors. - -While with the removal of all uses the vulnerability is actually fixed, -removing the creation of the linear mapping adds an extra layer of -protection. Similarly reducing visibility of the accessors mostly -eliminates the risk of undue re-introduction of uses of the linear -mappings. - -This is (not strictly) part of XSA-286. - -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Andrew Cooper - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 582ea09725..57333bb120 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1682,9 +1682,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn, - l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = - idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; - -- /* Slot 258: Self linear mappings. */ -+ /* Slot 258: Self linear mappings (shadow pt only). */ - ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); - l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = -+ !shadow_mode_external(d) ? 
l4e_empty() : - l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); - - /* Slot 259: Shadow linear mappings (if applicable) .*/ -diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h -index 3fd3f0617a..bb2f50cb6e 100644 ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -139,6 +139,15 @@ enum { - # define GUEST_PTE_SIZE 4 - #endif - -+/* Where to find each level of the linear mapping */ -+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -+#define __linear_l2_table \ -+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) -+#define __linear_l3_table \ -+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) -+#define __linear_l4_table \ -+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) -+ - /****************************************************************************** - * Auditing routines - */ -diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c -index 9e87a55174..ce03f83f52 100644 ---- a/xen/arch/x86/x86_64/mm.c -+++ b/xen/arch/x86/x86_64/mm.c -@@ -808,9 +808,6 @@ void __init paging_init(void) - - machine_to_phys_mapping_valid = 1; - -- /* Set up linear page table mapping. */ -- l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], -- l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW)); - return; - - nomem: -diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h -index 665e9cc31d..17b8ea0cfd 100644 ---- a/xen/include/asm-x86/config.h -+++ b/xen/include/asm-x86/config.h -@@ -197,7 +197,7 @@ extern unsigned char boot_edid_info[128]; - */ - #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) - #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) --/* Slot 258: linear page table (guest table). */ -+/* Slot 258: linear page table (monitor table, HVM only). */ - #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) - #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) - /* Slot 259: linear page table (shadow table). */ -diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h -index f632affaef..fd2574267c 100644 ---- a/xen/include/asm-x86/page.h -+++ b/xen/include/asm-x86/page.h -@@ -294,19 +294,6 @@ void copy_page_sse2(void *, const void *); - #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))) - #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) - --#endif /* !defined(__ASSEMBLY__) */ -- --/* Where to find each level of the linear mapping */ --#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) --#define __linear_l2_table \ -- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) --#define __linear_l3_table \ -- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) --#define __linear_l4_table \ -- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) -- -- --#ifndef __ASSEMBLY__ - extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; - extern l2_pgentry_t *compat_idle_pg_table_l2; - extern unsigned int m2p_compat_vstart;
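
The behavioural core of the revised XSA-286 approach is the flush accumulation at the tail of do_mmu_update() in the second patch above: L2 and higher modifications set flush_linear_pt, XPTI root-pagetable changes set flush_root_pt_local or flush_root_pt_others, and a single flush pass at the end of the batch services everything. The standalone C sketch below models only that decision logic. It is illustrative, not Xen code: the cpumask is reduced to a bit field, flush_local()/flush_mask() are stubs that merely print the flags they would be handed, and update_kind/mmu_update_batch() are invented names used for the sake of the example.

/*
 * Standalone model of the flush accumulation introduced by
 * xsa286-4.14-0002 (not Xen code: cpumask_t, flush_local() and
 * flush_mask() are simplified stand-ins, and the hypercall batch is
 * reduced to a list of update kinds).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLUSH_TLB         0x1
#define FLUSH_ROOT_PGTBL  0x2

typedef uint64_t cpumask_t;              /* one bit per CPU, toy version */

static void flush_local(unsigned int flags)
{
    printf("local CPU : flags %#x\n", flags);
}

static void flush_mask(cpumask_t mask, unsigned int flags)
{
    printf("mask %#llx: flags %#x\n", (unsigned long long)mask, flags);
}

enum update_kind {
    UPDATE_L1,                           /* L1 entry write */
    UPDATE_L2,                           /* L2 or L3 entry write */
    UPDATE_L3,
    UPDATE_L4_LOCAL_ROOT,                /* L4 currently in use on this CPU (XPTI) */
    UPDATE_L4_OTHER,                     /* L4 with references not accountable locally */
};

/* Mirrors the tail of do_mmu_update() after the revised patch. */
static void mmu_update_batch(const enum update_kind *batch, unsigned int nr,
                             unsigned int cpu, cpumask_t dirty_cpumask)
{
    bool flush_linear_pt = false, flush_root_pt_local = false,
         flush_root_pt_others = false;

    for ( unsigned int i = 0; i < nr; i++ )
    {
        switch ( batch[i] )
        {
        case UPDATE_L1:
            break;                       /* L1 writes need no linear-PT flush */
        case UPDATE_L2:
        case UPDATE_L3:
            flush_linear_pt = true;
            break;
        case UPDATE_L4_LOCAL_ROOT:
            flush_linear_pt = true;
            flush_root_pt_local = true;
            break;
        case UPDATE_L4_OTHER:
            flush_linear_pt = true;
            flush_root_pt_others = true;
            break;
        }
    }

    /* flush_root_pt_* imply flush_linear_pt, as the patch's comment notes. */
    if ( flush_linear_pt )
    {
        cpumask_t mask = dirty_cpumask;

        if ( mask & (1ULL << cpu) )      /* local CPU dirty: flush it separately */
        {
            mask &= ~(1ULL << cpu);
            flush_local(FLUSH_TLB |
                        (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0));
        }

        if ( mask )                      /* remaining dirty CPUs, local one excluded */
            flush_mask(mask, FLUSH_TLB |
                             (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0));
    }
}

int main(void)
{
    const enum update_kind batch[] = { UPDATE_L1, UPDATE_L2, UPDATE_L4_OTHER };

    /* CPU 1 issues the batch; CPUs 0, 1 and 3 are dirty for the pagetable owner. */
    mmu_update_batch(batch, 3, 1, 0x0b);
    return 0;
}

As the comment block added by the patch explains, the FLUSH_TLB part cannot be deferred or relaxed to FLUSH_ROOT_PGTBL across a multicall: the root-pagetable resync happens on the return-to-guest path, which is too late to keep the linear pagetable range consistent between the individual hypercalls of the batch.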