From 3b402adc22be6ae8feaf75279cece717bbe0fe7b Mon Sep 17 00:00:00 2001 From: Michael Young Date: Nov 12 2019 22:27:28 +0000 Subject: add missing XSA-299 patches --- diff --git a/xen.spec b/xen.spec index de5ba3e..5e7cb22 100644 --- a/xen.spec +++ b/xen.spec @@ -58,7 +58,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.12.1 -Release: 5%{?dist} +Release: 6%{?dist} License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -117,20 +117,30 @@ Patch54: xen.python38.patch Patch55: xsa296.patch Patch56: xsa298.patch Patch57: xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch -Patch58: xsa301-master-1.patch -Patch59: xsa301-master-2.patch -Patch60: xsa301-master-3.patch -Patch61: xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch -Patch62: xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch -Patch63: xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch -Patch64: xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch -Patch65: xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch -Patch66: xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch -Patch67: xsa304-4.12-1.patch -Patch68: xsa304-4.12-2.patch -Patch69: xsa304-4.12-3.patch -Patch70: xsa305-4.12-1.patch -Patch71: xsa305-4.12-2.patch +Patch58: xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch +Patch59: xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch +Patch60: xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch +Patch61: xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch +Patch62: xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch +Patch63: xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch +Patch64: xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch 
+Patch65: xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch +Patch66: xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch +Patch67: xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch +Patch68: xsa301-master-1.patch +Patch69: xsa301-master-2.patch +Patch70: xsa301-master-3.patch +Patch71: xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch +Patch72: xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch +Patch73: xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch +Patch74: xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch +Patch75: xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch +Patch76: xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch +Patch77: xsa304-4.12-1.patch +Patch78: xsa304-4.12-2.patch +Patch79: xsa304-4.12-3.patch +Patch80: xsa305-4.12-1.patch +Patch81: xsa305-4.12-2.patch %if %build_qemutrad @@ -335,10 +345,8 @@ manage Xen virtual machines. %patch58 -p1 %patch59 -p1 %patch60 -p1 -%ifarch %{ix86} x86_64 %patch61 -p1 %patch62 -p1 -%endif %patch63 -p1 %patch64 -p1 %patch65 -p1 @@ -347,7 +355,19 @@ manage Xen virtual machines. 
%patch68 -p1 %patch69 -p1 %patch70 -p1 +%ifarch %{ix86} x86_64 %patch71 -p1 +%patch72 -p1 +%endif +%patch73 -p1 +%patch74 -p1 +%patch75 -p1 +%patch76 -p1 +%patch77 -p1 +%patch78 -p1 +%patch79 -p1 +%patch80 -p1 +%patch81 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -911,6 +931,9 @@ fi %endif %changelog +* Tue Nov 12 2019 Michael Young - 4.12.1-6 +- add missing XSA-299 patches + * Tue Nov 12 2019 Michael Young - 4.12.1-5 - x86: Machine Check Error on Page Size Change DoS [XSA-304, CVE-2018-12207] - TSX Asynchronous Abort speculative side channel [XSA-305, CVE-2019-11135] diff --git a/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch new file mode 100644 index 0000000..a74598e --- /dev/null +++ b/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch @@ -0,0 +1,99 @@ +From b490792c18f74b76ec8161721c1e07f810e36309 Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially + de-validated page + +When unpinning pagetables, if an operation is interrupted, +relinquish_memory() re-sets PGT_pinned so that the un-pin will +be picked up again when the hypercall restarts. + +This is appropriate when put_page_and_type_preemptible() returns +-EINTR, which indicates that the page is back in its initial state +(i.e., completely validated). However, for -ERESTART, this leads to a +state where a page has both PGT_pinned and PGT_partial set. + +This happens to work at the moment, although it's not really a +"canonical" state; but in subsequent patches, where we need to make a +distinction in handling between PGT_validated and PGT_partial pages, +this causes issues. 
+ +Move to a "canonical" state by: +- Only re-setting PGT_pinned on -EINTR +- Re-dropping the refcount held by PGT_pinned on -ERESTART + +In the latter case, the PGT_partial bit will be cleared further down +with the rest of the other PGT_partial pages. + +While here, clean up some trailing whitespace. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++--- + 1 file changed, 28 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 2585327834..59df8a6d8d 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -114,7 +114,7 @@ static void play_dead(void) + * this case, heap corruption or #PF can occur (when heap debugging is + * enabled). For example, even printk() can involve tasklet scheduling, + * which touches per-cpu vars. +- * ++ * + * Consider very carefully when adding code to *dead_idle. Most hypervisor + * subsystems are unsafe to call. + */ +@@ -1909,9 +1909,34 @@ static int relinquish_memory( + break; + case -ERESTART: + case -EINTR: ++ /* ++ * -EINTR means PGT_validated has been re-set; re-set ++ * PGT_pinned again so that it gets picked up next time ++ * around. ++ * ++ * -ERESTART, OTOH, means PGT_partial is set instead. Put ++ * it back on the list, but don't set PGT_pinned; the ++ * section below will finish off de-validation. But we do ++ * need to drop the general ref associated with ++ * PGT_pinned, since put_page_and_type_preemptible() ++ * didn't do it. ++ * ++ * NB we can do an ASSERT for PGT_validated, since we ++ * "own" the type ref; but theoretically, the PGT_partial ++ * could be cleared by someone else. 
++ */ ++ if ( ret == -EINTR ) ++ { ++ ASSERT(page->u.inuse.type_info & PGT_validated); ++ set_bit(_PGT_pinned, &page->u.inuse.type_info); ++ } ++ else ++ put_page(page); ++ + ret = -ERESTART; ++ ++ /* Put the page back on the list and drop the ref we grabbed above */ + page_list_add(page, list); +- set_bit(_PGT_pinned, &page->u.inuse.type_info); + put_page(page); + goto out; + default: +@@ -2161,7 +2186,7 @@ void vcpu_kick(struct vcpu *v) + * pending flag. These values may fluctuate (after all, we hold no + * locks) but the key insight is that each change will cause + * evtchn_upcall_pending to be polled. +- * ++ * + * NB2. We save the running flag across the unblock to avoid a needless + * IPI for domains that we IPI'd to unblock. + */ +-- +2.23.0 + diff --git a/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch new file mode 100644 index 0000000..226e548 --- /dev/null +++ b/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch @@ -0,0 +1,618 @@ +From 0f9f61e5737fdd346550ec6e30161fa99e4653fa Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into + individual flags + +At the moment, partial_pte is a tri-state that contains two distinct bits +of information: + +1. If zero, the pte at index [nr_validated_ptes] is un-validated. If + non-zero, the pte was last seen with PGT_partial set. + +2. If positive, the pte at index [nr_validated_ptes] does not hold a + general reference count. If negative, it does. + +To make future patches more clear, separate out this functionality +into two distinct, named bits: PTF_partial_set (for #1) and +PTF_partial_general_ref (for #2). + +Additionally, a number of functions which need this information also +take other flags to control behavior (such as `preemptible` and +`defer`). 
These are hard to read in the caller (since you only see +'true' or 'false'), and ugly when many are added together. In +preparation for adding yet another flag in a future patch, collapse +all of these into a single `flag` variable. + +NB that this does mean checking for what was previously the '-1' +condition a bit more ugly in the put_page_from_lNe functions (since +you have to check for both partial_set and general ref); but this +clause will go away in a future patch. + +Also note that the original comment had an off-by-one error: +partial_flags (like partial_pte before it) concerns +plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1]. + +No functional change intended. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++--------------- + xen/include/asm-x86/mm.h | 41 +++++++--- + 2 files changed, 128 insertions(+), 78 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a1b55c10ff..3f6f8cc9b8 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1094,20 +1094,35 @@ get_page_from_l1e( + } + + #ifdef CONFIG_PV ++ ++/* ++ * The following flags are used to specify behavior of various get and ++ * put commands. The first two are also stored in page->partial_flags ++ * to indicate the state of the page pointed to by ++ * page->pte[page->nr_validated_entries]. See the comment in mm.h for ++ * more information. 
++ */ ++#define PTF_partial_set (1 << 0) ++#define PTF_partial_general_ref (1 << 1) ++#define PTF_preemptible (1 << 2) ++#define PTF_defer (1 << 3) ++ + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, +- int partial, int preemptible) ++ unsigned int flags) + { + struct page_info *page = mfn_to_page(mfn); + int rc; ++ bool preemptible = flags & PTF_preemptible, ++ partial_ref = flags & PTF_partial_general_ref; + +- if ( likely(partial >= 0) && ++ if ( likely(!partial_ref) && + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + + rc = _get_page_type(page, type, preemptible); + +- if ( unlikely(rc) && partial >= 0 && ++ if ( unlikely(rc) && !partial_ref && + (!preemptible || page != current->arch.old_guest_table) ) + put_page(page); + +@@ -1117,7 +1132,7 @@ static int get_page_and_type_from_mfn( + define_get_linear_pagetable(l2); + static int + get_page_from_l2e( +- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) ++ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags) + { + unsigned long mfn = l2e_get_pfn(l2e); + int rc; +@@ -1129,8 +1144,9 @@ get_page_from_l2e( + return -EINVAL; + } + +- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, +- partial, false); ++ ASSERT(!(flags & PTF_preemptible)); ++ ++ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; + +@@ -1140,7 +1156,7 @@ get_page_from_l2e( + define_get_linear_pagetable(l3); + static int + get_page_from_l3e( +- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) ++ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags) + { + int rc; + +@@ -1152,7 +1168,7 @@ get_page_from_l3e( + } + + rc = get_page_and_type_from_mfn( +- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1); ++ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); + if ( unlikely(rc == -EINVAL) 
&& + !is_pv_32bit_domain(d) && + get_l3_linear_pagetable(l3e, pfn, d) ) +@@ -1164,7 +1180,7 @@ get_page_from_l3e( + define_get_linear_pagetable(l4); + static int + get_page_from_l4e( +- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) ++ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags) + { + int rc; + +@@ -1176,7 +1192,7 @@ get_page_from_l4e( + } + + rc = get_page_and_type_from_mfn( +- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1); ++ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); + if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) + rc = 0; + +@@ -1277,7 +1293,7 @@ static void put_data_page(struct page_info *page, bool writeable) + * Note also that this automatically deals correctly with linear p.t.'s. + */ + static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + int rc = 0; + +@@ -1300,12 +1316,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + struct page_info *pg = l2e_get_page(l2e); + struct page_info *ptpg = mfn_to_page(_mfn(pfn)); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + rc = _put_page_type(pg, true, ptpg); + } +- else if ( defer ) ++ else if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; +@@ -1322,7 +1339,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + } + + static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + struct page_info *pg; + int rc; +@@ -1345,13 +1362,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + pg = l3e_get_page(l3e); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- 
ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + } + +- if ( defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; +@@ -1366,7 +1384,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + } + + static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + int rc = 1; + +@@ -1375,13 +1393,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + { + struct page_info *pg = l4e_get_page(l4e); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + } + +- if ( defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; +@@ -1492,12 +1511,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + pl2e = map_domain_page(_mfn(pfn)); + + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l2_pgentry_t l2e; + +@@ -1520,17 +1540,18 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + rc = -EINTR; + } + else +- rc = get_page_from_l2e(l2e, pfn, d, partial); ++ rc = get_page_from_l2e(l2e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', retain 'general ref' */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; +- 
page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + else if ( rc < 0 && rc != -EINTR ) +@@ -1539,7 +1560,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1563,7 +1584,8 @@ static int alloc_l3_table(struct page_info *page) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l3_pgentry_t *pl3e; + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + pl3e = map_domain_page(_mfn(pfn)); + +@@ -1578,7 +1600,7 @@ static int alloc_l3_table(struct page_info *page) + memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); + + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l3_pgentry_t l3e = pl3e[i]; + +@@ -1597,7 +1619,8 @@ static int alloc_l3_table(struct page_info *page) + else + rc = get_page_and_type_from_mfn( + l3e_get_mfn(l3e), +- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); ++ PGT_l2_page_table | PGT_pae_xen_l2, d, ++ partial_flags | PTF_preemptible); + } + else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + { +@@ -1606,17 +1629,18 @@ static int alloc_l3_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l3e(l3e, pfn, d, partial); ++ rc = get_page_from_l3e(l3e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', leave 'general ref' set if this entry was set */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + if ( rc < 0 ) +@@ -1633,7 +1657,7 @@ static int alloc_l3_table(struct page_info *page) + if ( i ) + { 
+ page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1767,10 +1791,11 @@ static int alloc_l4_table(struct page_info *page) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l4_pgentry_t l4e; + +@@ -1786,12 +1811,13 @@ static int alloc_l4_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l4e(l4e, pfn, d, partial); ++ rc = get_page_from_l4e(l4e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', leave 'general ref' set if this entry was set */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc < 0 ) + { +@@ -1801,7 +1827,7 @@ static int alloc_l4_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + if ( rc == -EINTR ) + rc = -ERESTART; + else +@@ -1853,19 +1879,20 @@ static int free_l2_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + pl2e = map_domain_page(_mfn(pfn)); + + for ( ; ; ) + { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) +- rc = put_page_from_l2e(pl2e[i], pfn, partial, false); ++ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags); + if ( rc < 0 ) + break; + +- partial = 0; ++ 
partial_flags = 0; + + if ( !i-- ) + break; +@@ -1887,12 +1914,14 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + +@@ -1904,18 +1933,19 @@ static int free_l3_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l3_pgentry_t *pl3e; +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + pl3e = map_domain_page(_mfn(pfn)); + + for ( ; ; ) + { +- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); ++ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags); + if ( rc < 0 ) + break; + +- partial = 0; ++ partial_flags = 0; + if ( rc == 0 ) + pl3e[i] = unadjust_guest_l3e(pl3e[i], d); + +@@ -1934,12 +1964,14 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + return rc > 0 ? 
0 : rc; +@@ -1950,26 +1982,29 @@ static int free_l4_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + do { + if ( is_guest_l4_slot(d, i) ) +- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); ++ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags); + if ( rc < 0 ) + break; +- partial = 0; ++ partial_flags = 0; + } while ( i-- ); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + +@@ -2247,7 +2282,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, + return -EBUSY; + } + +- put_page_from_l2e(ol2e, pfn, 0, true); ++ put_page_from_l2e(ol2e, pfn, PTF_defer); + + return rc; + } +@@ -2315,7 +2350,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, + if ( !create_pae_xen_mappings(d, pl3e) ) + BUG(); + +- put_page_from_l3e(ol3e, pfn, 0, 1); ++ put_page_from_l3e(ol3e, pfn, PTF_defer); + return rc; + } + +@@ -2378,7 +2413,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + return -EFAULT; + } + +- put_page_from_l4e(ol4e, pfn, 0, 1); ++ put_page_from_l4e(ol4e, pfn, PTF_defer); + return rc; + } + #endif /* CONFIG_PV */ +@@ -2649,7 +2684,7 @@ int free_page_type(struct page_info *page, unsigned long type, + if ( !(type & PGT_partial) ) + { + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; +- page->partial_pte = 0; ++ page->partial_flags = 0; + } + + switch ( type & PGT_type_mask ) 
+@@ -2946,7 +2981,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + if ( !(x & PGT_partial) ) + { + page->nr_validated_ptes = 0; +- page->partial_pte = 0; ++ page->partial_flags = 0; + } + page->linear_pt_count = 0; + rc = alloc_page_type(page, type, preemptible); +@@ -3122,7 +3157,7 @@ int new_guest_cr3(mfn_t mfn) + return 0; + } + +- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1); ++ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible); + switch ( rc ) + { + case 0: +@@ -3473,7 +3508,7 @@ long do_mmuext_op( + if ( op.arg1.mfn != 0 ) + { + rc = get_page_and_type_from_mfn( +- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1); ++ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); + + if ( unlikely(rc) ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 6faa563167..8406ac3c37 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -228,19 +228,34 @@ struct page_info + * setting the flag must not drop that reference, whereas the instance + * clearing it will have to. + * +- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has +- * been partially validated. This implies that the general reference +- * to the page (acquired from get_page_from_lNe()) would be dropped +- * (again due to the apparent failure) and hence must be re-acquired +- * when resuming the validation, but must not be dropped when picking +- * up the page for invalidation. ++ * If partial_flags & PTF_partial_set is set, then the page at ++ * at @nr_validated_ptes had PGT_partial set as a result of an ++ * operation on the current page. (That page may or may not ++ * still have PGT_partial set.) + * +- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has +- * been partially invalidated. This is basically the opposite case of +- * above, i.e. 
the general reference to the page was not dropped in +- * put_page_from_lNe() (due to the apparent failure), and hence it +- * must be dropped when the put operation is resumed (and completes), +- * but it must not be acquired if picking up the page for validation. ++ * If PTF_partial_general_ref is set, then the PTE at ++ * @nr_validated_ptef holds a general reference count for the ++ * page. ++ * ++ * This happens: ++ * - During de-validation, if de-validation of the page was ++ * interrupted ++ * - During validation, if an invalid entry is encountered and ++ * validation is preemptible ++ * - During validation, if PTF_partial_general_ref was set on ++ * this entry to begin with (perhaps because we're picking ++ * up from a partial de-validation). ++ * ++ * When resuming validation, if PTF_partial_general_ref is clear, ++ * then a general reference must be re-acquired; if it is set, no ++ * reference should be acquired. ++ * ++ * When resuming de-validation, if PTF_partial_general_ref is ++ * clear, no reference should be dropped; if it is set, a ++ * reference should be dropped. ++ * ++ * NB that PTF_partial_set and PTF_partial_general_ref are ++ * defined in mm.c, the only place where they are used. 
+ * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page +@@ -251,7 +266,7 @@ struct page_info + struct { + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; + u16 :16 - PAGETABLE_ORDER - 1 - 2; +- s16 partial_pte:2; ++ u16 partial_flags:2; + s16 linear_pt_count; + }; + +-- +2.23.0 + diff --git a/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch new file mode 100644 index 0000000..d07c233 --- /dev/null +++ b/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch @@ -0,0 +1,140 @@ +From db1d801aa8dcb918a27486a6e8d9cf5d7307dec3 Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a + boolean + +This is mainly in preparation for _put_page_type taking the +partial_flags value in the future. It also makes it easier to read in +the caller (since you see a flag name rather than `true` or `false`). + +No functional change intended. + +This is part of XSA-299. 
+ +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 25 +++++++++++++------------ + 1 file changed, 13 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 3f6f8cc9b8..0740b61af8 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1200,7 +1200,7 @@ get_page_from_l4e( + } + #endif /* CONFIG_PV */ + +-static int _put_page_type(struct page_info *page, bool preemptible, ++static int _put_page_type(struct page_info *page, unsigned int flags, + struct page_info *ptpg); + + void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) +@@ -1320,7 +1320,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- rc = _put_page_type(pg, true, ptpg); ++ rc = _put_page_type(pg, PTF_preemptible, ptpg); + } + else if ( flags & PTF_defer ) + { +@@ -1329,7 +1329,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + } + else + { +- rc = _put_page_type(pg, true, ptpg); ++ rc = _put_page_type(pg, PTF_preemptible, ptpg); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1366,7 +1366,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + } + + if ( flags & PTF_defer ) +@@ -1376,7 +1376,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + return 0; + } + +- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + +@@ -1397,7 +1397,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ return _put_page_type(pg, PTF_preemptible, 
mfn_to_page(_mfn(pfn))); + } + + if ( flags & PTF_defer ) +@@ -1407,7 +1407,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + return 0; + } + +- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + } +@@ -2757,10 +2757,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type, + } + + +-static int _put_page_type(struct page_info *page, bool preemptible, ++static int _put_page_type(struct page_info *page, unsigned int flags, + struct page_info *ptpg) + { + unsigned long nx, x, y = page->u.inuse.type_info; ++ bool preemptible = flags & PTF_preemptible; + + ASSERT(current_locked_page_ne_check(page)); + +@@ -2969,7 +2970,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(iommu_ret) ) + { +- _put_page_type(page, false, NULL); ++ _put_page_type(page, 0, NULL); + rc = iommu_ret; + goto out; + } +@@ -2996,7 +2997,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + void put_page_type(struct page_info *page) + { +- int rc = _put_page_type(page, false, NULL); ++ int rc = _put_page_type(page, 0, NULL); + ASSERT(rc == 0); + (void)rc; + } +@@ -3013,7 +3014,7 @@ int get_page_type(struct page_info *page, unsigned long type) + + int put_page_type_preemptible(struct page_info *page) + { +- return _put_page_type(page, true, NULL); ++ return _put_page_type(page, PTF_preemptible, NULL); + } + + int get_page_type_preemptible(struct page_info *page, unsigned long type) +@@ -3030,7 +3031,7 @@ int put_old_guest_table(struct vcpu *v) + if ( !v->arch.old_guest_table ) + return 0; + +- switch ( rc = _put_page_type(v->arch.old_guest_table, true, ++ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, + v->arch.old_guest_ptpg) ) + { + case -EINTR: +-- +2.23.0 + diff --git a/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch 
b/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch new file mode 100644 index 0000000..9cfbb73 --- /dev/null +++ b/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch @@ -0,0 +1,79 @@ +From 6f257854c8778774210281c5c21028c4b7739b44 Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional + +Make it easier to read by declaring the conditions in which we will +retain the ref, rather than the conditions under which we release it. + +The only way (page == current->arch.old_guest_table) can be true is if +preemptible is true; so remove this from the query itself, and add an +ASSERT() to that effect on the opposite path. + +No functional change intended. + +NB that alloc_lN_table() mishandle the "linear pt failure" situation +described in the comment; this will be addressed in a future patch. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 37 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0740b61af8..0a4d39a2c3 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1122,8 +1122,43 @@ static int get_page_and_type_from_mfn( + + rc = _get_page_type(page, type, preemptible); + +- if ( unlikely(rc) && !partial_ref && +- (!preemptible || page != current->arch.old_guest_table) ) ++ /* ++ * Retain the refcount if: ++ * - page is fully validated (rc == 0) ++ * - page is not validated (rc < 0) but: ++ * - We came in with a reference (partial_ref) ++ * - page is partially validated but there's been an error ++ * (page == current->arch.old_guest_table) ++ * ++ * The partial_ref-on-error clause is worth an explanation. 
There ++ * are two scenarios where partial_ref might be true coming in: ++ * - mfn has been partially demoted as type `type`; i.e. has ++ * PGT_partial set ++ * - mfn has been partially demoted as L(type+1) (i.e., a linear ++ * page; e.g. we're being called from get_page_from_l2e with ++ * type == PGT_l1_table, but the mfn is PGT_l2_table) ++ * ++ * If there's an error, in the first case, _get_page_type will ++ * either return -ERESTART, in which case we want to retain the ++ * ref (as the caller will consider it retained), or -EINVAL, in ++ * which case old_guest_table will be set; in both cases, we need ++ * to retain the ref. ++ * ++ * In the second case, if there's an error, _get_page_type() can ++ * *only* return -EINVAL, and *never* set old_guest_table. In ++ * that case we also want to retain the reference, to allow the ++ * page to continue to be torn down (i.e., PGT_partial cleared) ++ * safely. ++ * ++ * Also note that we shouldn't be able to leave with the reference ++ * count retained unless we succeeded, or the operation was ++ * preemptible. ++ */ ++ if ( likely(!rc) || partial_ref ) ++ /* nothing */; ++ else if ( page == current->arch.old_guest_table ) ++ ASSERT(preemptible); ++ else + put_page(page); + + return rc; +-- +2.23.0 + diff --git a/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch new file mode 100644 index 0000000..72ee3ea --- /dev/null +++ b/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch @@ -0,0 +1,111 @@ +From 4ad70553611a7a4e4494d5a3b51b5cc295a488e0 Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when + preempting + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. 
This is stored in two elements in the page +struct: nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated. + +At the moment, a distinction is made between promotion and demotion +with regard to whether the entry itself "holds" a general reference +count: when entry promotion is interrupted (i.e., returns -ERESTART), +the entry is not considered to hold a reference; when entry demotion +is interrupted, the entry is still considered to hold a general +reference. + +PTF_partial_general_ref is used to distinguish between these cases. +If clear, it's a partial promotion => no general reference count held +by the entry; if set, it's partial demotion, so a general reference +count held. Because promotions and demotions can be interleaved, this +value is passed to get_page_and_type_from_mfn and put_page_from_l*e, +to be able to properly handle reference counts. + +Unfortunately, when alloc_l[23]_table check hypercall_preempt_check() +and return -ERESTART, they set nr_entries_validated, but don't clear +partial_flags. + +If we were picking up from a previously-interrupted promotion, that +means that PTF_partial_set would be set even though +[nr_entries_validated] was not partially validated. This means that +if the page in this state were de-validated, put_page_type() would +erroneously be called on that entry. + +Perhaps worse, if we were racing with a de-validation, then we might +leave both PTF_partial_set and PTF_partial_general_ref; and when +de-validation picked up again, both the type and the general ref would +be erroneously dropped from [nr_entries_validated]. + +In a sense, the real issue here is code duplication. 
Rather than +duplicate the interruption code, set rc to -EINTR and fall through to +the code which already handles that case correctly. + +Given the logic at this point, it should be impossible for +partial_flags to be non-zero; add an ASSERT() to catch any changes. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 25 ++++++------------------- + 1 file changed, 6 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0a4d39a2c3..bbd29a68f4 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1554,21 +1554,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +- l2_pgentry_t l2e; ++ l2_pgentry_t l2e = pl2e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) +- { +- page->nr_validated_ptes = i; +- rc = -ERESTART; +- break; +- } +- +- if ( !is_guest_l2_slot(d, type, i) ) ++ rc = -EINTR; ++ else if ( !is_guest_l2_slot(d, type, i) ) + continue; +- +- l2e = pl2e[i]; +- +- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) ++ else if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + if ( !pv_l1tf_check_l2e(d, l2e) ) + continue; +@@ -1640,13 +1632,8 @@ static int alloc_l3_table(struct page_info *page) + l3_pgentry_t l3e = pl3e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) +- { +- page->nr_validated_ptes = i; +- rc = -ERESTART; +- break; +- } +- +- if ( is_pv_32bit_domain(d) && (i == 3) ) ++ rc = -EINTR; ++ else if ( is_pv_32bit_domain(d) && (i == 3) ) + { + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & l3_disallow_mask(d)) ) +-- +2.23.0 + diff --git a/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch new file mode 100644 index 0000000..ef390e2 --- /dev/null +++ 
b/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch @@ -0,0 +1,378 @@ +From 51fe4e67d954649fcf103116be6206a769f0db1e Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. This is stored in two elements in the page struct: +nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated. + +At the moment, a distinction is made between promotion and demotion +with regard to whether the entry itself "holds" a general reference +count: when entry promotion is interrupted (i.e., returns -ERESTART), +the entry is not considered to hold a reference; when entry demotion +is interrupted, the entry is still considered to hold a general +reference. + +PTF_partial_general_ref is used to distinguish between these cases. +If clear, it's a partial promotion => no general reference count held +by the entry; if set, it's partial demotion, so a general reference +count held. Because promotions and demotions can be interleaved, this +value is passed to get_page_and_type_from_mfn and put_page_from_l*e, +to be able to properly handle reference counts. + +Unfortunately, because a refcount is not held, it is possible to +engineer a situation where PTF_partial_set is set but the page in +question has been assigned to another domain. A sketch is provided in +the appendix. + +Fix this by having the parent page table entry hold a general +reference count whenever PTF_partial_set is set. (For clarity of +change, keep two separate flags.
These will be collapsed in a +subsequent changeset.) + +This has two basic implications. On the put_page_from_lNe() side, +this means that the (partial_set && !partial_ref) case can never happen, +and no longer needs to be special-cased. + +Secondly, because both flags are set together, there's no need to carry over +existing bits from partial_pte. + +(NB there is still another issue with calling _put_page_type() on a +page which had PGT_partial set; that will be handled in a subsequent +patch.) + +On the get_page_and_type_from_mfn() side, we need to distinguish +between callers which hold a reference on partial (i.e., +alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and +so on): pass a flag if the type should be retained on interruption. + +NB that since l1 promotion can't be preempted, get_page_from_l2e +can't return -ERESTART. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +----- +* Appendix: Engineering PTF_partial_set while a page belongs to a + foreign domain + +Suppose A is a page which can be promoted to an l3, and B is a page +which can be promoted to an l2, and A[x] points to B. B has +PGC_allocated set but no other general references. + +V1: PIN_L3 A. + A is validated, B is validated. + A.type_count = 1 | PGT_validated | PGT_pinned + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated (A[x] holds a general ref) + +V1: UNPIN A. + A begins de-validation. + Arrange to be interrupted when i < x + V1->old_guest_table = A + V1->old_guest_table_ref_held = false + A.type_count = 1 | PGT_partial + A.nr_validated_entries = i < x + B.type_count = 0 + B.count = 1 | PGC_allocated + +V2: MOD_L4_ENTRY to point some l4e to A. + Picks up re-validation of A.
+ Arrange to be interrupted halfway through B's validation + B.type_count = 1 | PGT_partial + B.count = 2 | PGC_allocated (PGT_partial holds a general ref) + A.type_count = 1 | PGT_partial + A.nr_validated_entries = x + A.partial_pte = PTF_partial_set + +V3: MOD_L3_ENTRY to point some other l3e (not in A) to B. + Validates B. + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated ("other l3e" holds a general ref) + +V3: MOD_L3_ENTRY to clear l3e pointing to B. + Devalidates B. + B.type_count = 0 + B.count = 1 | PGC_allocated + +V3: decrease_reservation(B) + Clears PGC_allocated + B.count = 0 => B is freed + +B gets assigned to a different domain + +V1: Restarts UNPIN of A + put_old_guest_table(A) + ... + free_l3_table(A) + +Now since A.partial_flags has PTF_partial_set, free_l3_table() will +call put_page_from_l3e() on A[x], which points to B, while B is owned +by another domain. + +If A[x] held a general refcount for B on partial validation, as it does +for partial de-validation, then B would still have a reference count of +1 after PGC_allocated was freed; so B wouldn't be freed until after +put_page_from_l3e() had happened on A[x]. +--- + xen/arch/x86/mm.c | 84 +++++++++++++++++++++++----------------- + xen/include/asm-x86/mm.h | 15 ++++--- + 2 files changed, 58 insertions(+), 41 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index bbd29a68f4..4d3ebf341d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1102,10 +1102,11 @@ get_page_from_l1e( + * page->pte[page->nr_validated_entries]. See the comment in mm.h for + * more information.
+ */ +-#define PTF_partial_set (1 << 0) +-#define PTF_partial_general_ref (1 << 1) +-#define PTF_preemptible (1 << 2) +-#define PTF_defer (1 << 3) ++#define PTF_partial_set (1 << 0) ++#define PTF_partial_general_ref (1 << 1) ++#define PTF_preemptible (1 << 2) ++#define PTF_defer (1 << 3) ++#define PTF_retain_ref_on_restart (1 << 4) + + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, +@@ -1114,7 +1115,11 @@ static int get_page_and_type_from_mfn( + struct page_info *page = mfn_to_page(mfn); + int rc; + bool preemptible = flags & PTF_preemptible, +- partial_ref = flags & PTF_partial_general_ref; ++ partial_ref = flags & PTF_partial_general_ref, ++ partial_set = flags & PTF_partial_set, ++ retain_ref = flags & PTF_retain_ref_on_restart; ++ ++ ASSERT(partial_ref == partial_set); + + if ( likely(!partial_ref) && + unlikely(!get_page_from_mfn(mfn, d)) ) +@@ -1127,13 +1132,15 @@ static int get_page_and_type_from_mfn( + * - page is fully validated (rc == 0) + * - page is not validated (rc < 0) but: + * - We came in with a reference (partial_ref) ++ * - page is partially validated (rc == -ERESTART), and the ++ * caller has asked the ref to be retained in that case + * - page is partially validated but there's been an error + * (page == current->arch.old_guest_table) + * + * The partial_ref-on-error clause is worth an explanation. There + * are two scenarios where partial_ref might be true coming in: +- * - mfn has been partially demoted as type `type`; i.e. has +- * PGT_partial set ++ * - mfn has been partially promoted / demoted as type `type`; ++ * i.e. has PGT_partial set + * - mfn has been partially demoted as L(type+1) (i.e., a linear + * page; e.g. 
we're being called from get_page_from_l2e with + * type == PGT_l1_table, but the mfn is PGT_l2_table) +@@ -1156,7 +1163,8 @@ static int get_page_and_type_from_mfn( + */ + if ( likely(!rc) || partial_ref ) + /* nothing */; +- else if ( page == current->arch.old_guest_table ) ++ else if ( page == current->arch.old_guest_table || ++ (retain_ref && rc == -ERESTART) ) + ASSERT(preemptible); + else + put_page(page); +@@ -1354,8 +1362,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- rc = _put_page_type(pg, PTF_preemptible, ptpg); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + else if ( flags & PTF_defer ) + { +@@ -1400,8 +1408,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + + if ( flags & PTF_defer ) +@@ -1431,8 +1439,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + + if ( flags & PTF_defer ) +@@ -1569,13 +1577,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + else + rc = get_page_from_l2e(l2e, pfn, d, partial_flags); + +- if ( rc == -ERESTART ) +- { +- page->nr_validated_ptes = i; +- /* Set 'set', retain 'general ref' */ +- page->partial_flags = partial_flags | PTF_partial_set; +- } +- else if ( rc == -EINTR && i ) ++ /* ++ * It shouldn't be possible for get_page_from_l2e to return ++ * -ERESTART, since we never call this 
with PTF_preemptible. ++ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable ++ * entry.) ++ * ++ * NB that while on a "clean" promotion, we can never get ++ * PGT_partial. It is possible to arrange for an l2e to ++ * contain a partially-devalidated l2; but in that case, both ++ * of the following functions will fail anyway (the first ++ * because the page in question is not an l1; the second ++ * because the page is not fully validated). ++ */ ++ ASSERT(rc != -ERESTART); ++ ++ if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; + page->partial_flags = 0; +@@ -1584,6 +1601,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + else if ( rc < 0 && rc != -EINTR ) + { + gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); ++ ASSERT(current->arch.old_guest_table == NULL); + if ( i ) + { + page->nr_validated_ptes = i; +@@ -1642,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page) + rc = get_page_and_type_from_mfn( + l3e_get_mfn(l3e), + PGT_l2_page_table | PGT_pae_xen_l2, d, +- partial_flags | PTF_preemptible); ++ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); + } + else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + { +@@ -1651,13 +1669,14 @@ static int alloc_l3_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l3e(l3e, pfn, d, partial_flags); ++ rc = get_page_from_l3e(l3e, pfn, d, ++ partial_flags | PTF_retain_ref_on_restart); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = partial_flags | PTF_partial_set; ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i ) + { +@@ -1833,13 +1852,14 @@ static int alloc_l4_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l4e(l4e, pfn, d, partial_flags); ++ rc = get_page_from_l4e(l4e, pfn, d, ++ partial_flags | PTF_retain_ref_on_restart); + + if ( rc 
== -ERESTART ) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = partial_flags | PTF_partial_set; ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc < 0 ) + { +@@ -1936,9 +1956,7 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { +@@ -1986,9 +2004,7 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { +@@ -2019,9 +2035,7 @@ static int free_l4_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 8406ac3c37..02079e1324 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -238,22 +238,25 @@ struct page_info + * page. 
+ * + * This happens: +- * - During de-validation, if de-validation of the page was ++ * - During validation or de-validation, if the operation was + * interrupted + * - During validation, if an invalid entry is encountered and + * validation is preemptible + * - During validation, if PTF_partial_general_ref was set on +- * this entry to begin with (perhaps because we're picking +- * up from a partial de-validation). ++ * this entry to begin with (perhaps because it picked up a ++ * previous operation) + * +- * When resuming validation, if PTF_partial_general_ref is clear, +- * then a general reference must be re-acquired; if it is set, no +- * reference should be acquired. ++ * When resuming validation, if PTF_partial_general_ref is ++ * clear, then a general reference must be re-acquired; if it ++ * is set, no reference should be acquired. + * + * When resuming de-validation, if PTF_partial_general_ref is + * clear, no reference should be dropped; if it is set, a + * reference should be dropped. + * ++ * NB at the moment, PTF_partial_set should be set if and only if ++ * PTF_partial_general_ref is set. ++ * + * NB that PTF_partial_set and PTF_partial_general_ref are + * defined in mm.c, the only place where they are used. + * +-- +2.23.0 + diff --git a/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch new file mode 100644 index 0000000..6cf41d1 --- /dev/null +++ b/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch @@ -0,0 +1,227 @@ +From 8a8d836f7f7418e659d37817a66cd7a6b115042b Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and + PTF_partial_general_ref into one + +...now that they are equivalent. No functional change intended. 
+ +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 50 +++++++++++----------------------------- + xen/include/asm-x86/mm.h | 29 +++++++++++------------ + 2 files changed, 26 insertions(+), 53 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 4d3ebf341d..886e93b8aa 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1097,13 +1097,12 @@ get_page_from_l1e( + + /* + * The following flags are used to specify behavior of various get and +- * put commands. The first two are also stored in page->partial_flags +- * to indicate the state of the page pointed to by ++ * put commands. The first is also stored in page->partial_flags to ++ * indicate the state of the page pointed to by + * page->pte[page->nr_validated_entries]. See the comment in mm.h for + * more information. + */ + #define PTF_partial_set (1 << 0) +-#define PTF_partial_general_ref (1 << 1) + #define PTF_preemptible (1 << 2) + #define PTF_defer (1 << 3) + #define PTF_retain_ref_on_restart (1 << 4) +@@ -1115,13 +1114,10 @@ static int get_page_and_type_from_mfn( + struct page_info *page = mfn_to_page(mfn); + int rc; + bool preemptible = flags & PTF_preemptible, +- partial_ref = flags & PTF_partial_general_ref, + partial_set = flags & PTF_partial_set, + retain_ref = flags & PTF_retain_ref_on_restart; + +- ASSERT(partial_ref == partial_set); +- +- if ( likely(!partial_ref) && ++ if ( likely(!partial_set) && + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + +@@ -1131,14 +1127,14 @@ static int get_page_and_type_from_mfn( + * Retain the refcount if: + * - page is fully validated (rc == 0) + * - page is not validated (rc < 0) but: +- * - We came in with a reference (partial_ref) ++ * - We came in with a reference (partial_set) + * - page is partially validated (rc == -ERESTART), and the + * caller has asked the ref to be retained in that case + * - page is partially validated but there's been an error + * (page == 
current->arch.old_guest_table) + * +- * The partial_ref-on-error clause is worth an explanation. There +- * are two scenarios where partial_ref might be true coming in: ++ * The partial_set-on-error clause is worth an explanation. There ++ * are two scenarios where partial_set might be true coming in: + * - mfn has been partially promoted / demoted as type `type`; + * i.e. has PGT_partial set + * - mfn has been partially demoted as L(type+1) (i.e., a linear +@@ -1161,7 +1157,7 @@ static int get_page_and_type_from_mfn( + * count retained unless we succeeded, or the operation was + * preemptible. + */ +- if ( likely(!rc) || partial_ref ) ++ if ( likely(!rc) || partial_set ) + /* nothing */; + else if ( page == current->arch.old_guest_table || + (retain_ref && rc == -ERESTART) ) +@@ -1359,13 +1355,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + struct page_info *pg = l2e_get_page(l2e); + struct page_info *ptpg = mfn_to_page(_mfn(pfn)); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- else if ( flags & PTF_defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; +@@ -1405,13 +1395,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + pg = l3e_get_page(l3e); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- + if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); +@@ -1436,13 +1419,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + { + struct page_info *pg = l4e_get_page(l4e); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- + if ( flags & PTF_defer ) + { + 
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); +@@ -1676,7 +1652,7 @@ static int alloc_l3_table(struct page_info *page) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { +@@ -1859,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc < 0 ) + { +@@ -1956,7 +1932,7 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { +@@ -2004,7 +1980,7 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { +@@ -2035,7 +2011,7 @@ static int free_l4_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 02079e1324..f0fd35bf6b 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -233,7 +233,7 @@ struct page_info + * operation on the current page. (That page may or may not + * still have PGT_partial set.) 
+ * +- * If PTF_partial_general_ref is set, then the PTE at ++ * Additionally, if PTF_partial_set is set, then the PTE at + * @nr_validated_ptef holds a general reference count for the + * page. + * +@@ -242,23 +242,20 @@ struct page_info + * interrupted + * - During validation, if an invalid entry is encountered and + * validation is preemptible +- * - During validation, if PTF_partial_general_ref was set on +- * this entry to begin with (perhaps because it picked up a ++ * - During validation, if PTF_partial_set was set on this ++ * entry to begin with (perhaps because it picked up a + * previous operation) + * +- * When resuming validation, if PTF_partial_general_ref is +- * clear, then a general reference must be re-acquired; if it +- * is set, no reference should be acquired. ++ * When resuming validation, if PTF_partial_set is clear, then ++ * a general reference must be re-acquired; if it is set, no ++ * reference should be acquired. + * +- * When resuming de-validation, if PTF_partial_general_ref is +- * clear, no reference should be dropped; if it is set, a +- * reference should be dropped. ++ * When resuming de-validation, if PTF_partial_set is clear, ++ * no reference should be dropped; if it is set, a reference ++ * should be dropped. + * +- * NB at the moment, PTF_partial_set should be set if and only if +- * PTF_partial_general_ref is set. +- * +- * NB that PTF_partial_set and PTF_partial_general_ref are +- * defined in mm.c, the only place where they are used. ++ * NB that PTF_partial_set is defined in mm.c, the only place ++ * where it is used. 
+ * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page +@@ -268,8 +265,8 @@ struct page_info + */ + struct { + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; +- u16 :16 - PAGETABLE_ORDER - 1 - 2; +- u16 partial_flags:2; ++ u16 :16 - PAGETABLE_ORDER - 1 - 1; ++ u16 partial_flags:1; + s16 linear_pt_count; + }; + +-- +2.23.0 + diff --git a/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch new file mode 100644 index 0000000..bbaba79 --- /dev/null +++ b/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch @@ -0,0 +1,106 @@ +From da3d1d258e54fe600f7f75287183b74d957ec63b Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion + failures + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. This is stored in two elements in the page +struct: nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated, and a general reference count is held. + +Unfortunately, in cases where an entry began with PTF_partial_set set, +and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is +erroneously dropped. (This scenario can be engineered mainly by the +use of interleaving of promoting and demoting a page which has "linear +pagetable" entries; see the appendix for a sketch.) This means that +we will "leak" a general reference count on the page in question, +preventing the page from being freed. 
+ +Fix this by setting page->partial_flags to the partial_flags local +variable. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +----- +Appendix + +Suppose A and B can both be promoted to L2 pages, and A[x] points to B. + +V1: PIN_L2 B. + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated + +V1: MOD_L3_ENTRY pointing something to A. + In the process of validating A[x], grab an extra type / ref on B: + B.type_count = 2 | PGT_validated + B.count = 3 | PGC_allocated + A.type_count = 1 | PGT_validated + A.count = 2 | PGC_allocated + +V1: UNPIN B. + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated + +V1: MOD_L3_ENTRY removing the reference to A. + De-validate A, down to A[x], which points to B. + Drop the final type on B. Arrange to be interrupted. + B.type_count = 1 | PGT_partial + B.count = 2 | PGC_allocated + A.type_count = 1 | PGT_partial + A.nr_validated_entries = x + A.partial_pte = -1 + +V2: MOD_L3_ENTRY adds a reference to A. + +At this point, get_page_from_l2e(A[x]) tries +get_page_and_type_from_mfn(), which fails because it's the wrong type; +and get_l2_linear_pagetable() also fails, because B isn't validated as +an l2 anymore.
+--- + xen/arch/x86/mm.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 886e93b8aa..0a094291da 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1581,7 +1581,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1674,7 +1674,7 @@ static int alloc_l3_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1845,7 +1845,7 @@ static int alloc_l4_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + if ( rc == -EINTR ) + rc = -ERESTART; + else +-- +2.23.0 + diff --git a/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch new file mode 100644 index 0000000..7d5f022 --- /dev/null +++ b/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch @@ -0,0 +1,166 @@ +From b3e169dc8daeae85b0b51c25fdb142e2e552ec7f Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error + +If an invalid entry is discovered when validating a page-table tree, +the entire tree which has so far been validated must be de-validated. +Since this may take a long time, alloc_l[2-4]_table() set current +vcpu's old_guest_table immediately; put_old_guest_table() will make +sure that put_page_type() will be called to finish off the +de-validation before any other MMU operations can happen on the vcpu. 
+ +The invariant for partial pages should be: + +* Entries [0, nr_validated_ptes) should be completely validated; + put_page_type() will de-validate these. + +* If [nr_validated_ptes] is partially validated, partial_flags should + set PTF_partial_set. put_page_type() will be called on this page to + finish off devalidation, and the appropriate refcount adjustments + will be done. + +alloc_l[2-3]_table() indicates partial validation to its callers by +setting current->old_guest_table. + +Unfortunately, this is mishandled. + +Take the case where validating lNe[x] returns an error. + +First, alloc_l3_table() doesn't check old_guest_table at all; as a +result, partial_flags is not set when it should be. nr_validated_ptes +is set to x; and since PTF_partial_set is clear, de-validation resumes at +nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not +have put_page_type() called on it when de-validating the rest of the +l3: it will be stuck in the PGT_partial state until the domain is +destroyed, or until it is re-used as an l2. (Any other page type will +fail.) + +Worse, alloc_l4_table(), rather than setting PTF_partial_set as it +should, sets nr_validated_ptes to x+1. When de-validating, since +partial is 0, this will correctly resume calling put_page_type at [x]; +but, if the put_page_type() is never called, but instead +get_page_type() is called, validation will pick up at [x+1], +neglecting to validate [x]. If the rest of the validation succeeds, +the l4 will be validated even though [x] is invalid. + +Fix this in both cases by setting PTF_partial_set if old_guest_table +is set. + +While here, add some safety catches: +- old_guest_table must point to the page contained in + [nr_validated_ptes]. +- alloc_l1_table shouldn't set old_guest_table + +If we experience one of these situations in production builds, it's +safer to avoid calling put_page_type for the pages in question. 
If +they have PGT_partial set, they will be cleaned up on domain +destruction; if not, we have no idea whether a type count is safe to +drop. Retaining an extra type ref that should have been dropped may +trigger a BUG() on the free_domain_page() path, but dropping a type +count that shouldn't be dropped may cause a privilege escalation. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 51 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0a094291da..a432e69c74 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1580,6 +1580,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + ASSERT(current->arch.old_guest_table == NULL); + if ( i ) + { ++ /* ++ * alloc_l1_table() doesn't set old_guest_table; it does ++ * its own tear-down immediately on failure. If it ++ * did we'd need to check it and set partial_flags as we ++ * do in alloc_l[34]_table(). ++ * ++ * Note on the use of ASSERT: if it's non-null and ++ * hasn't been cleaned up yet, it should have ++ * PGT_partial set; and so the type will be cleaned up ++ * on domain destruction. Unfortunately, we would ++ * leak the general ref held by old_guest_table; but ++ * leaking a page is less bad than a host crash. 
++ */ ++ ASSERT(current->arch.old_guest_table == NULL); + page->nr_validated_ptes = i; + page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; +@@ -1607,6 +1621,7 @@ static int alloc_l3_table(struct page_info *page) + unsigned int i; + int rc = 0; + unsigned int partial_flags = page->partial_flags; ++ l3_pgentry_t l3e = l3e_empty(); + + pl3e = map_domain_page(_mfn(pfn)); + +@@ -1623,7 +1638,7 @@ static int alloc_l3_table(struct page_info *page) + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +- l3_pgentry_t l3e = pl3e[i]; ++ l3e = pl3e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) + rc = -EINTR; +@@ -1675,6 +1690,24 @@ static int alloc_l3_table(struct page_info *page) + { + page->nr_validated_ptes = i; + page->partial_flags = partial_flags; ++ if ( current->arch.old_guest_table ) ++ { ++ /* ++ * We've experienced a validation failure. If ++ * old_guest_table is set, "transfer" the general ++ * reference count to pl3e[nr_validated_ptes] by ++ * setting PTF_partial_set. ++ * ++ * As a precaution, check that old_guest_table is the ++ * page pointed to by pl3e[nr_validated_ptes]. If ++ * not, it's safer to leak a type ref on production ++ * builds. ++ */ ++ if ( current->arch.old_guest_table == l3e_get_page(l3e) ) ++ page->partial_flags = PTF_partial_set; ++ else ++ ASSERT_UNREACHABLE(); ++ } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1851,7 +1884,23 @@ static int alloc_l4_table(struct page_info *page) + else + { + if ( current->arch.old_guest_table ) +- page->nr_validated_ptes++; ++ { ++ /* ++ * We've experienced a validation failure. If ++ * old_guest_table is set, "transfer" the general ++ * reference count to pl3e[nr_validated_ptes] by ++ * setting PTF_partial_set. ++ * ++ * As a precaution, check that old_guest_table is the ++ * page pointed to by pl4e[nr_validated_ptes]. 
If ++ * not, it's safer to leak a type ref on production ++ * builds. ++ */ ++ if ( current->arch.old_guest_table == l4e_get_page(l4e) ) ++ page->partial_flags = PTF_partial_set; ++ else ++ ASSERT_UNREACHABLE(); ++ } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +-- +2.23.0 + diff --git a/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch new file mode 100644 index 0000000..ad7e6fe --- /dev/null +++ b/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch @@ -0,0 +1,413 @@ +From ea3dc624c5e6325a9c2f079e52a85965d4ab6ce8 Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Thu, 10 Oct 2019 17:57:50 +0100 +Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to + begin with + +Validation and de-validation of pagetable trees may take arbitrarily +large amounts of time, and so must be preemptible. This is indicated +by setting the PGT_partial bit in the type_info, and setting +nr_validated_entries and partial_flags appropriately. Specifically, +if the entry at [nr_validated_entries] is partially validated, +partial_flags should have the PTF_partial_set bit set, and the entry +should hold a general reference count. During de-validation, +put_page_type() is called on partially validated entries. + +Unfortunately, there are a number of issues with the current algorithm. + +First, doing a "normal" put_page_type() is not safe when no type ref +is held: there is nothing to stop another vcpu from coming along and +picking up validation again: at which point the put_page_type may drop +the only page ref on an in-use page. Some examples are listed in the +appendix. 
+ +The core issue is that put_page_type() is being called both to clean +up PGT_partial, and to drop a type count; and has no way of knowing +which is which; and so if in between, PGT_partial is cleared, +put_page_type() will drop the type ref erroneously. + +What is needed is to distinguish between two states: +- Dropping a type ref which is held +- Cleaning up a page which has been partially de/validated + +Fix this by telling put_page_type() which of the two activities you +intend. + +When cleaning up a partial de/validation, take no action unless you +find a page partially validated. + +If put_page_type() is called without PTF_partial_set, and finds the +page in a PGT_partial state anyway, then there's certainly been a +misaccounting somewhere, and carrying on would almost certainly cause +a security issue, so crash the host instead. + +In put_page_from_lNe, pass partial_flags on to _put_page_type(). + +old_guest_table may be set either with a fully validated page (when +using the "deferred put" pattern), or with a partially validated page +(when a normal "de-validation" is interrupted, or when a validation +fails part-way through due to invalid entries). Add a flag, +old_guest_table_partial, to indicate which of these it is, and use +that to pass the appropriate flag to _put_page_type(). + +While here, delete stray trailing whitespace. + +This is part of XSA-299. + +Reported-by: George Dunlap +Signed-off-by: George Dunlap +Reviewed-by: Jan Beulich +----- +Appendix: + +Suppose page A, when interpreted as an l3 pagetable, contains all +valid entries; and suppose A[x] points to page B, which when +interpreted as an l2 pagetable, contains all valid entries. 
+ +P1: PIN_L3_TABLE + A -> PGT_l3_table | 1 | valid + B -> PGT_l2_table | 1 | valid + +P1: UNPIN_TABLE + > Arrange to interrupt after B has been de-validated + B: + type_info -> PGT_l2_table | 0 + A: + type_info -> PGT_l3_table | 1 | partial + nr_validated_entries -> (less than x) + +P2: mod_l4_entry to point to A + > Arrange for this to be interrupted while B is being validated + B: + type_info -> PGT_l2_table | 1 | partial + (nr_validated_entries &c set as appropriate) + A: + type_info -> PGT_l3_table | 1 | partial + nr_validated_entries -> x + partial_pte = 1 + +P3: mod_l3_entry some other unrelated l3 to point to B: + B: + type_info -> PGT_l2_table | 1 + +P1: Restart UNPIN_TABLE + +At this point, since A.nr_validated_entries == x and A.partial_pte != +0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping +its type count to 0 while it's still being pointed to by some other l3. + +A similar issue arises with old_guest_table. Consider the following +scenario: + +Suppose A is a page which, when interpreted as an l2, has valid entries +until entry x, which is invalid. + +V1: PIN_L2_TABLE(A) + + A -> PGT_l2_table | 1 | PGT_partial + V1 -> old_guest_table = A + + +V2: PIN_L2_TABLE(A) + + A -> PGT_l2_table | 1 | PGT_partial + V2 -> old_guest_table = A + + put_old_guest_table() + _put_page_type(A) + A -> PGT_l2_table | 0 + +V1: + put_old_guest_table() + _put_page_type(A) # UNDERFLOW + +Indeed, it is possible to engineer for old_guest_table for every vcpu +a guest has to point to the same page. 
+--- + xen/arch/x86/domain.c | 6 +++ + xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++----- + xen/include/asm-x86/domain.h | 4 +- + 3 files changed, 95 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 59df8a6d8d..f1ae5f89f5 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1104,9 +1104,15 @@ int arch_set_info_guest( + rc = -ERESTART; + /* Fallthrough */ + case -ERESTART: ++ /* ++ * NB that we're putting the kernel-mode table ++ * here, which we've already successfully ++ * validated above; hence partial = false; ++ */ + v->arch.old_guest_ptpg = NULL; + v->arch.old_guest_table = + pagetable_get_page(v->arch.guest_table); ++ v->arch.old_guest_table_partial = false; + v->arch.guest_table = pagetable_null(); + break; + default: +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a432e69c74..81774368a0 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1359,10 +1359,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + } + else + { +- rc = _put_page_type(pg, PTF_preemptible, ptpg); ++ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1385,6 +1386,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + unsigned long mfn = l3e_get_pfn(l3e); + bool writeable = l3e_get_flags(l3e) & _PAGE_RW; + ++ ASSERT(!(flags & PTF_partial_set)); + ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); + do { + put_data_page(mfn_to_page(_mfn(mfn)), writeable); +@@ -1397,12 +1399,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + if ( flags & PTF_defer ) + { ++ ASSERT(!(flags & PTF_partial_set)); + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + 
return 0; + } + +- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + +@@ -1421,12 +1425,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + + if ( flags & PTF_defer ) + { ++ ASSERT(!(flags & PTF_partial_set)); + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + return 0; + } + +- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, flags | PTF_preemptible, ++ mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1535,6 +1542,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + + pl2e = map_domain_page(_mfn(pfn)); + ++ /* ++ * NB that alloc_l2_table will never set partial_pte on an l2; but ++ * free_l2_table might if a linear_pagetable entry is interrupted ++ * partway through de-validation. In that circumstance, ++ * get_page_from_l2e() will always return -EINVAL; and we must ++ * retain the type ref by doing the normal partial_flags tracking. ++ */ ++ + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +@@ -1598,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + } + if ( rc < 0 ) +@@ -1704,12 +1720,16 @@ static int alloc_l3_table(struct page_info *page) + * builds. 
+ */ + if ( current->arch.old_guest_table == l3e_get_page(l3e) ) ++ { ++ ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; ++ } + else + ASSERT_UNREACHABLE(); + } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + while ( i-- > 0 ) + pl3e[i] = unadjust_guest_l3e(pl3e[i], d); +@@ -1897,12 +1917,16 @@ static int alloc_l4_table(struct page_info *page) + * builds. + */ + if ( current->arch.old_guest_table == l4e_get_page(l4e) ) ++ { ++ ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; ++ } + else + ASSERT_UNREACHABLE(); + } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + } + } +@@ -2831,6 +2855,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags, + x = y; + nx = x - 1; + ++ /* ++ * Is this expected to do a full reference drop, or only ++ * cleanup partial validation / devalidation? ++ * ++ * If the former, the caller must hold a "full" type ref; ++ * which means the page must be validated. If the page is ++ * *not* fully validated, continuing would almost certainly ++ * open up a security hole. An exception to this is during ++ * domain destruction, where PGT_validated can be dropped ++ * without dropping a type ref. ++ * ++ * If the latter, do nothing unless type PGT_partial is set. ++ * If it is set, the type count must be 1. 
++ */ ++ if ( !(flags & PTF_partial_set) ) ++ BUG_ON((x & PGT_partial) || ++ !((x & PGT_validated) || page_get_owner(page)->is_dying)); ++ else if ( !(x & PGT_partial) ) ++ return 0; ++ else ++ BUG_ON((x & PGT_count_mask) != 1); ++ + ASSERT((x & PGT_count_mask) != 0); + + switch ( nx & (PGT_locked | PGT_count_mask) ) +@@ -3092,17 +3138,34 @@ int put_old_guest_table(struct vcpu *v) + if ( !v->arch.old_guest_table ) + return 0; + +- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, +- v->arch.old_guest_ptpg) ) ++ rc = _put_page_type(v->arch.old_guest_table, ++ PTF_preemptible | ++ ( v->arch.old_guest_table_partial ? ++ PTF_partial_set : 0 ), ++ v->arch.old_guest_ptpg); ++ ++ if ( rc == -ERESTART || rc == -EINTR ) + { +- case -EINTR: +- case -ERESTART: ++ v->arch.old_guest_table_partial = (rc == -ERESTART); + return -ERESTART; +- case 0: +- put_page(v->arch.old_guest_table); + } + ++ /* ++ * It shouldn't be possible for _put_page_type() to return ++ * anything else at the moment; but if it does happen in ++ * production, leaking the type ref is probably the best thing to ++ * do. Either way, drop the general ref held by old_guest_table. ++ */ ++ ASSERT(rc == 0); ++ ++ put_page(v->arch.old_guest_table); + v->arch.old_guest_table = NULL; ++ v->arch.old_guest_ptpg = NULL; ++ /* ++ * Safest default if someone sets old_guest_table without ++ * explicitly setting old_guest_table_partial. 
++ */ ++ v->arch.old_guest_table_partial = true; + + return rc; + } +@@ -3253,11 +3316,11 @@ int new_guest_cr3(mfn_t mfn) + switch ( rc = put_page_and_type_preemptible(page) ) + { + case -EINTR: +- rc = -ERESTART; +- /* fallthrough */ + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); ++ rc = -ERESTART; + break; + default: + BUG_ON(rc); +@@ -3494,6 +3557,7 @@ long do_mmuext_op( + { + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ curr->arch.old_guest_table_partial = false; + } + } + } +@@ -3528,6 +3592,11 @@ long do_mmuext_op( + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ /* ++ * EINTR means we still hold the type ref; ERESTART ++ * means PGT_partial holds the type ref ++ */ ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = 0; + break; + default: +@@ -3596,11 +3665,15 @@ long do_mmuext_op( + switch ( rc = put_page_and_type_preemptible(page) ) + { + case -EINTR: +- rc = -ERESTART; +- /* fallthrough */ + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ /* ++ * EINTR means we still hold the type ref; ++ * ERESTART means PGT_partial holds the ref ++ */ ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); ++ rc = -ERESTART; + break; + default: + BUG_ON(rc); +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index 214e44ce1c..2cfce7b36b 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -307,7 +307,7 @@ struct arch_domain + + struct paging_domain paging; + struct p2m_domain *p2m; +- /* To enforce lock ordering in the pod code wrt the ++ /* To enforce lock ordering in the pod code wrt the + * page_alloc lock */ + int page_alloc_unlock_level; + +@@ -581,6 +581,8 @@ struct arch_vcpu + struct page_info *old_guest_table; /* partially destructed pagetable */ + struct page_info 
*old_guest_ptpg; /* containing page table of the */ + /* former, if any */ ++ bool old_guest_table_partial; /* Are we dropping a type ref, or just ++ * finishing up a partial de-validation? */ + /* guest_table holds a ref to the page, and also a type-count unless + * shadow refcounts are in use */ + pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ +-- +2.23.0 +