|
|
593eb17 |
From: Jan Beulich <jbeulich@suse.com>
|
|
|
593eb17 |
Subject: x86/mm: also allow L2 (un)validation to be preemptible
|
|
|
593eb17 |
|
|
|
593eb17 |
Commit c612481d1c ("x86/mm: Plumbing to allow any PTE update to fail
|
|
|
593eb17 |
with -ERESTART") added assertions next to the {alloc,free}_l2_table()
|
|
|
593eb17 |
invocations to document (and validate in debug builds) that L2
|
|
|
593eb17 |
(un)validations are always preemptible.
|
|
|
593eb17 |
|
|
|
593eb17 |
The assertion in free_page_type() has now been observed to trigger when
|
|
|
593eb17 |
recursive L2 page tables get cleaned up.
|
|
|
593eb17 |
|
|
|
593eb17 |
In particular put_page_from_l2e()'s assumption that _put_page_type()
|
|
|
593eb17 |
would always succeed is now wrong, resulting in a partially un-validated
|
|
|
593eb17 |
page left in a domain, which has no other means of getting cleaned up
|
|
|
593eb17 |
later on. Even if it causes no problems earlier, this would ultimately
|
|
|
593eb17 |
trigger the check for ->u.inuse.type_info having a zero count when
|
|
|
593eb17 |
freeing the page during cleanup after the domain has died.
|
|
|
593eb17 |
|
|
|
593eb17 |
As a result, it should be considered a mistake not to have extended
|
|
|
593eb17 |
preemption fully to L2 when it was added to L3/L4 table handling, which
|
|
|
593eb17 |
this change aims to correct.
|
|
|
593eb17 |
|
|
|
593eb17 |
The validation side additions are done just for symmetry.
|
|
|
593eb17 |
|
|
|
593eb17 |
This is part of XSA-290.
|
|
|
593eb17 |
|
|
|
593eb17 |
Reported-by: Manuel Bouyer <bouyer@antioche.eu.org>
|
|
|
593eb17 |
Tested-by: Manuel Bouyer <bouyer@antioche.eu.org>
|
|
|
593eb17 |
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
|
|
593eb17 |
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
|
|
593eb17 |
|
|
|
593eb17 |
--- a/xen/arch/x86/mm.c
|
|
|
593eb17 |
+++ b/xen/arch/x86/mm.c
|
|
|
593eb17 |
@@ -1126,7 +1126,7 @@ get_page_from_l1e(
|
|
|
593eb17 |
define_get_linear_pagetable(l2);
|
|
|
593eb17 |
static int
|
|
|
593eb17 |
get_page_from_l2e(
|
|
|
593eb17 |
- l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
|
|
|
593eb17 |
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
|
|
|
593eb17 |
{
|
|
|
593eb17 |
unsigned long mfn = l2e_get_pfn(l2e);
|
|
|
593eb17 |
int rc;
|
|
|
593eb17 |
@@ -1141,7 +1141,8 @@ get_page_from_l2e(
|
|
|
593eb17 |
return -EINVAL;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
|
|
|
593eb17 |
+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
|
|
|
593eb17 |
+ partial, false);
|
|
|
593eb17 |
if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
|
|
|
593eb17 |
rc = 0;
|
|
|
593eb17 |
|
|
|
593eb17 |
@@ -1295,8 +1296,11 @@ void put_page_from_l1e(l1_pgentry_t l1e,
|
|
|
593eb17 |
* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
|
|
|
593eb17 |
* Note also that this automatically deals correctly with linear p.t.'s.
|
|
|
593eb17 |
*/
|
|
|
593eb17 |
-static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
|
|
|
593eb17 |
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
|
|
|
593eb17 |
+ int partial, bool defer)
|
|
|
593eb17 |
{
|
|
|
593eb17 |
+ int rc = 0;
|
|
|
593eb17 |
+
|
|
|
593eb17 |
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
|
|
|
593eb17 |
return 1;
|
|
|
593eb17 |
|
|
|
593eb17 |
@@ -1311,13 +1315,27 @@ static int put_page_from_l2e(l2_pgentry_
|
|
|
593eb17 |
else
|
|
|
593eb17 |
{
|
|
|
593eb17 |
struct page_info *pg = l2e_get_page(l2e);
|
|
|
593eb17 |
- int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn)));
|
|
|
593eb17 |
+ struct page_info *ptpg = mfn_to_page(_mfn(pfn));
|
|
|
593eb17 |
|
|
|
593eb17 |
- ASSERT(!rc);
|
|
|
593eb17 |
- put_page(pg);
|
|
|
593eb17 |
+ if ( unlikely(partial > 0) )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ ASSERT(!defer);
|
|
|
593eb17 |
+ rc = _put_page_type(pg, true, ptpg);
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
+ else if ( defer )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ current->arch.old_guest_ptpg = ptpg;
|
|
|
593eb17 |
+ current->arch.old_guest_table = pg;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
+ else
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ rc = _put_page_type(pg, true, ptpg);
|
|
|
593eb17 |
+ if ( likely(!rc) )
|
|
|
593eb17 |
+ put_page(pg);
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
- return 0;
|
|
|
593eb17 |
+ return rc;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
|
|
|
593eb17 |
@@ -1487,11 +1505,12 @@ static int alloc_l2_table(struct page_in
|
|
|
593eb17 |
unsigned long pfn = mfn_x(page_to_mfn(page));
|
|
|
593eb17 |
l2_pgentry_t *pl2e;
|
|
|
593eb17 |
unsigned int i;
|
|
|
593eb17 |
- int rc = 0;
|
|
|
593eb17 |
+ int rc = 0, partial = page->partial_pte;
|
|
|
593eb17 |
|
|
|
593eb17 |
pl2e = map_domain_page(_mfn(pfn));
|
|
|
593eb17 |
|
|
|
593eb17 |
- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
|
|
|
593eb17 |
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
|
|
|
593eb17 |
+ i++, partial = 0 )
|
|
|
593eb17 |
{
|
|
|
593eb17 |
if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
|
|
|
593eb17 |
{
|
|
|
593eb17 |
@@ -1501,23 +1520,33 @@ static int alloc_l2_table(struct page_in
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
if ( !is_guest_l2_slot(d, type, i) ||
|
|
|
593eb17 |
- (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
|
|
|
593eb17 |
+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
|
|
|
593eb17 |
continue;
|
|
|
593eb17 |
|
|
|
593eb17 |
- if ( unlikely(rc == -ERESTART) )
|
|
|
593eb17 |
+ if ( rc == -ERESTART )
|
|
|
593eb17 |
{
|
|
|
593eb17 |
page->nr_validated_ptes = i;
|
|
|
593eb17 |
- break;
|
|
|
593eb17 |
+ page->partial_pte = partial ?: 1;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
-
|
|
|
593eb17 |
- if ( rc < 0 )
|
|
|
593eb17 |
+ else if ( rc == -EINTR && i )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ page->nr_validated_ptes = i;
|
|
|
593eb17 |
+ page->partial_pte = 0;
|
|
|
593eb17 |
+ rc = -ERESTART;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
+ else if ( rc < 0 && rc != -EINTR )
|
|
|
593eb17 |
{
|
|
|
593eb17 |
gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
|
|
|
593eb17 |
- while ( i-- > 0 )
|
|
|
593eb17 |
- if ( is_guest_l2_slot(d, type, i) )
|
|
|
593eb17 |
- put_page_from_l2e(pl2e[i], pfn);
|
|
|
593eb17 |
- break;
|
|
|
593eb17 |
+ if ( i )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ page->nr_validated_ptes = i;
|
|
|
593eb17 |
+ page->partial_pte = 0;
|
|
|
593eb17 |
+ current->arch.old_guest_ptpg = NULL;
|
|
|
593eb17 |
+ current->arch.old_guest_table = page;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
}
|
|
|
593eb17 |
+ if ( rc < 0 )
|
|
|
593eb17 |
+ break;
|
|
|
593eb17 |
|
|
|
593eb17 |
pl2e[i] = adjust_guest_l2e(pl2e[i], d);
|
|
|
593eb17 |
}
|
|
|
593eb17 |
@@ -1797,28 +1826,50 @@ static int free_l2_table(struct page_inf
|
|
|
593eb17 |
struct domain *d = page_get_owner(page);
|
|
|
593eb17 |
unsigned long pfn = mfn_x(page_to_mfn(page));
|
|
|
593eb17 |
l2_pgentry_t *pl2e;
|
|
|
593eb17 |
- unsigned int i = page->nr_validated_ptes - 1;
|
|
|
593eb17 |
- int err = 0;
|
|
|
593eb17 |
+ int rc = 0, partial = page->partial_pte;
|
|
|
593eb17 |
+ unsigned int i = page->nr_validated_ptes - !partial;
|
|
|
593eb17 |
|
|
|
593eb17 |
pl2e = map_domain_page(_mfn(pfn));
|
|
|
593eb17 |
|
|
|
593eb17 |
- ASSERT(page->nr_validated_ptes);
|
|
|
593eb17 |
- do {
|
|
|
593eb17 |
- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
|
|
|
593eb17 |
- put_page_from_l2e(pl2e[i], pfn) == 0 &&
|
|
|
593eb17 |
- i && hypercall_preempt_check() )
|
|
|
593eb17 |
+ for ( ; ; )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
|
|
|
593eb17 |
+ rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
|
|
|
593eb17 |
+ if ( rc < 0 )
|
|
|
593eb17 |
+ break;
|
|
|
593eb17 |
+
|
|
|
593eb17 |
+ partial = 0;
|
|
|
593eb17 |
+
|
|
|
593eb17 |
+ if ( !i-- )
|
|
|
593eb17 |
+ break;
|
|
|
593eb17 |
+
|
|
|
593eb17 |
+ if ( hypercall_preempt_check() )
|
|
|
593eb17 |
{
|
|
|
593eb17 |
- page->nr_validated_ptes = i;
|
|
|
593eb17 |
- err = -ERESTART;
|
|
|
593eb17 |
+ rc = -EINTR;
|
|
|
593eb17 |
+ break;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
- } while ( !err && i-- );
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
|
|
|
593eb17 |
unmap_domain_page(pl2e);
|
|
|
593eb17 |
|
|
|
593eb17 |
- if ( !err )
|
|
|
593eb17 |
+ if ( rc >= 0 )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
page->u.inuse.type_info &= ~PGT_pae_xen_l2;
|
|
|
593eb17 |
+ rc = 0;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
+ else if ( rc == -ERESTART )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ page->nr_validated_ptes = i;
|
|
|
593eb17 |
+ page->partial_pte = partial ?: -1;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
+ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
|
|
|
593eb17 |
+ {
|
|
|
593eb17 |
+ page->nr_validated_ptes = i + 1;
|
|
|
593eb17 |
+ page->partial_pte = 0;
|
|
|
593eb17 |
+ rc = -ERESTART;
|
|
|
593eb17 |
+ }
|
|
|
593eb17 |
|
|
|
593eb17 |
- return err;
|
|
|
593eb17 |
+ return rc;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
static int free_l3_table(struct page_info *page)
|
|
|
593eb17 |
@@ -2138,7 +2189,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
|
|
|
593eb17 |
return -EBUSY;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
- if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
|
|
|
593eb17 |
+ if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d, 0)) < 0) )
|
|
|
593eb17 |
return rc;
|
|
|
593eb17 |
|
|
|
593eb17 |
nl2e = adjust_guest_l2e(nl2e, d);
|
|
|
593eb17 |
@@ -2157,7 +2208,8 @@ static int mod_l2_entry(l2_pgentry_t *pl
|
|
|
593eb17 |
return -EBUSY;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|
|
|
593eb17 |
- put_page_from_l2e(ol2e, pfn);
|
|
|
593eb17 |
+ put_page_from_l2e(ol2e, pfn, 0, true);
|
|
|
593eb17 |
+
|
|
|
593eb17 |
return rc;
|
|
|
593eb17 |
}
|
|
|
593eb17 |
|