161d5c0
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type()

_get_page_type() suffers from a race condition where it incorrectly assumes
that because 'x' was read and a subsequent cmpxchg() succeeds, the type
cannot have changed in-between.  Consider:

CPU A:
  1. Creates an L2e referencing pg
     `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
  2.     Issues flush_tlb_mask()
CPU B:
  3. Creates a writeable mapping of pg
     `-> _get_page_type(pg, PGT_writable_page), count increases to 1
  4. Writes into the new mapping, creating a TLB entry for pg
  5. Removes the writeable mapping of pg
     `-> _put_page_type(pg), count goes back down to 0
CPU A:
  6.     Issues cmpxchg(), setting count 1, type PGT_l1_page_table

CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
suitably protected (i.e. read-only).  The TLB flush in step 2 must be deferred
until after the guest is prohibited from creating new writeable mappings,
which is after step 6.
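For illustration only, here is a minimal, self-contained sketch of the
pattern (not part of the patch; the constants, the type_info word and the
buggy_get_page_type()/flush_tlb_mask() stand-ins are made up, and the real
_get_page_type() is considerably more involved).  The buggy shape is "read
x, act on x, then cmpxchg(x => nx)", which succeeds even if the word
changed and changed back inside the window:

    /* Simplified model of the race; not Xen code. */
    #include <stdatomic.h>
    #include <stdbool.h>

    #define PGT_type_mask     0xf0000000u /* type in the high bits         */
    #define PGT_count_mask    0x0fffffffu /* typeref count in the low bits */
    #define PGT_writable_page 0x10000000u
    #define PGT_l1_page_table 0x20000000u

    static _Atomic unsigned int type_info = PGT_writable_page; /* count 0 */

    static void flush_tlb_mask(void) { /* stand-in for the real flush */ }

    /* Buggy ordering: the flush is issued before the typeref is taken. */
    static bool buggy_get_page_type(unsigned int type)
    {
        unsigned int x  = atomic_load(&type_info);
        unsigned int nx = ((x & ~PGT_type_mask) | type) + 1; /* type, +1 ref */

        flush_tlb_mask(); /* step 2: too early - steps 3-5 can still happen */

        /*
         * Step 6: succeeds even though another CPU took and dropped a
         * writeable typeref in the window above, because the word has
         * returned to the old value 'x' (the ABAC of the subject line).
         */
        return atomic_compare_exchange_strong(&type_info, &x, nx);
    }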
Defer all safety actions until after the cmpxchg() has successfully taken the
intended typeref, because that is what prevents concurrent users from using
the old type.
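In the same simplified model (again only an illustrative sketch, not the
patched Xen code), the fixed shape takes the typeref first and performs the
safety actions afterwards:

    /* Fixed ordering in the same simplified model; not Xen code. */
    static bool fixed_get_page_type(unsigned int type)
    {
        unsigned int x  = atomic_load(&type_info);
        unsigned int nx = ((x & ~PGT_type_mask) | type) + 1;

        if ( !atomic_compare_exchange_strong(&type_info, &x, nx) )
            return false; /* the real code retries */

        /*
         * The typeref is now globally visible, so the guest can no longer
         * create new writeable mappings.  Only now is it safe to flush.
         */
        flush_tlb_mask();
        return true;
    }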
Also remove the early validation for writeable and shared pages.  This removes
race conditions where one half of a parallel mapping attempt can return
successfully before:
 * The IOMMU pagetables are in sync with the new page type
 * Writeable mappings to shared pages have been torn down
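For instance (an illustrative interleaving in the style of the one above,
not taken from the patch):

CPU A:
  1. _get_page_type(pg, PGT_writable_page)
     `-> takes the typeref, sets PGT_validated, starts the IOMMU update
CPU B:
  2. _get_page_type(pg, PGT_writable_page)
     `-> sees PGT_validated already set, and returns success while A's
         IOMMU update is still in flight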
This is part of XSA-401 / CVE-2022-26362.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ddd32f88c798..1693b580b152 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
              * Type changes are permitted when the typeref is 0.  If the type
              * actually changes, the page needs re-validating.
              */
-            struct domain *d = page_get_owner(page);
-
-            if ( d && shadow_mode_enabled(d) )
-               shadow_prepare_page_type_change(d, page, type);
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
             {
-                /*
-                 * On type change we check to flush stale TLB entries. It is
-                 * vital that no other CPUs are left with writeable mappings
-                 * to a frame which is intending to become pgtable/segdesc.
-                 */
-                cpumask_t *mask = this_cpu(scratch_cpumask);
-
-                BUG_ON(in_irq());
-                cpumask_copy(mask, d->dirty_cpumask);
-
-                /* Don't flush if the timestamp is old enough */
-                tlbflush_filter(mask, page->tlbflush_timestamp);
-
-                if ( unlikely(!cpumask_empty(mask)) &&
-                     /* Shadow mode: track only writable pages. */
-                     (!shadow_mode_enabled(d) ||
-                      ((nx & PGT_type_mask) == PGT_writable_page)) )
-                {
-                    perfc_incr(need_flush_tlb_flush);
-                    /*
-                     * If page was a page table make sure the flush is
-                     * performed using an IPI in order to avoid changing the
-                     * type of a page table page under the feet of
-                     * spurious_page_fault().
-                     */
-                    flush_mask(mask,
-                               (x & PGT_type_mask) &&
-                               (x & PGT_type_mask) <= PGT_root_page_table
-                               ? FLUSH_TLB | FLUSH_FORCE_IPI
-                               : FLUSH_TLB);
-                }
-
-                /* We lose existing type and validity. */
                 nx &= ~(PGT_type_mask | PGT_validated);
                 nx |= type;
-
-                /*
-                 * No special validation needed for writable pages.
-                 * Page tables and GDT/LDT need to be scanned for validity.
-                 */
-                if ( type == PGT_writable_page || type == PGT_shared_page )
-                    nx |= PGT_validated;
             }
         }
         else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
             return -EINTR;
     }
 
+    /*
+     * One typeref has been taken and is now globally visible.
+     *
+     * The page is either in the "validate locked" state (PGT_[type] | 1) or
+     * fully validated (PGT_[type] | PGT_validated | >0).
+     */
+
+    if ( unlikely((x & PGT_count_mask) == 0) )
+    {
+        struct domain *d = page_get_owner(page);
+
+        if ( d && shadow_mode_enabled(d) )
+            shadow_prepare_page_type_change(d, page, type);
+
+        if ( (x & PGT_type_mask) != type )
+        {
+            /*
+             * On type change we check to flush stale TLB entries. It is
+             * vital that no other CPUs are left with writeable mappings
+             * to a frame which is intending to become pgtable/segdesc.
+             */
+            cpumask_t *mask = this_cpu(scratch_cpumask);
+
+            BUG_ON(in_irq());
+            cpumask_copy(mask, d->dirty_cpumask);
+
+            /* Don't flush if the timestamp is old enough */
+            tlbflush_filter(mask, page->tlbflush_timestamp);
+
+            if ( unlikely(!cpumask_empty(mask)) &&
+                 /* Shadow mode: track only writable pages. */
+                 (!shadow_mode_enabled(d) ||
+                  ((nx & PGT_type_mask) == PGT_writable_page)) )
+            {
+                perfc_incr(need_flush_tlb_flush);
+                /*
+                 * If page was a page table make sure the flush is
+                 * performed using an IPI in order to avoid changing the
+                 * type of a page table page under the feet of
+                 * spurious_page_fault().
+                 */
+                flush_mask(mask,
+                           (x & PGT_type_mask) &&
+                           (x & PGT_type_mask) <= PGT_root_page_table
+                           ? FLUSH_TLB | FLUSH_FORCE_IPI
+                           : FLUSH_TLB);
+            }
+        }
+    }
+
     if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
                   (type == PGT_writable_page)) )
     {
@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
 
     if ( unlikely(!(nx & PGT_validated)) )
     {
-        if ( !(x & PGT_partial) )
+        /*
+         * No special validation needed for writable or shared pages.  Page
+         * tables and GDT/LDT need to have their contents audited.
+         *
+         * per validate_page(), non-atomic updates are fine here.
+         */
+        if ( type == PGT_writable_page || type == PGT_shared_page )
+            page->u.inuse.type_info |= PGT_validated;
+        else
         {
-            page->nr_validated_ptes = 0;
-            page->partial_flags = 0;
-            page->linear_pt_count = 0;
+            if ( !(x & PGT_partial) )
+            {
+                page->nr_validated_ptes = 0;
+                page->partial_flags = 0;
+                page->linear_pt_count = 0;
+            }
+
+            rc = validate_page(page, type, preemptible);
         }
-        rc = validate_page(page, type, preemptible);
     }
 
  out: