161d5c0
From: Andrew Cooper <andrew.cooper3@citrix.com>
161d5c0
Subject: x86/pv: Track and flush non-coherent mappings of RAM
161d5c0
161d5c0
There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
161d5c0
devices that make non-coherent writes.  The Linux sound subsystem makes
161d5c0
extensive use of this technique.
161d5c0
161d5c0
For such use cases, the guest's DMA buffer is mapped and consistently used as
161d5c0
WC, and Xen doesn't interact with the buffer.
161d5c0
161d5c0
However, a mischievous guest can use WC mappings to deliberately create
161d5c0
non-coherency between the cache and RAM, and use this to trick Xen into
161d5c0
validating a pagetable which isn't actually safe.
161d5c0
161d5c0
Allocate a new PGT_non_coherent to track the non-coherency of mappings.  Set
161d5c0
it whenever a non-coherent writeable mapping is created.  If the page is used
161d5c0
as anything other than PGT_writable_page, force a cache flush before
161d5c0
validation.  Also force a cache flush before the page is returned to the heap.
161d5c0
161d5c0
This is CVE-2022-26364, part of XSA-402.
161d5c0
161d5c0
Reported-by: Jann Horn <jannh@google.com>
161d5c0
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
161d5c0
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
161d5c0
Reviewed-by: Jan Beulich <jbeulich@suse.com>
161d5c0
161d5c0
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
161d5c0
index ab32d13a1a0d..bab9624fabb7 100644
161d5c0
--- a/xen/arch/x86/mm.c
161d5c0
+++ b/xen/arch/x86/mm.c
161d5c0
@@ -997,6 +997,15 @@ get_page_from_l1e(
161d5c0
         return -EACCES;
161d5c0
     }
161d5c0
 
161d5c0
+    /*
161d5c0
+     * Track writeable non-coherent mappings to RAM pages, to trigger a cache
161d5c0
+     * flush later if the target is used as anything but a PGT_writable_page.
161d5c0
+     * We care about all writeable mappings, including foreign mappings.
161d5c0
+     */
161d5c0
+    if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
161d5c0
+         (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
161d5c0
+        set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
161d5c0
+
161d5c0
     return 0;
161d5c0
 
161d5c0
  could_not_pin:
161d5c0
@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
161d5c0
         }
161d5c0
     }
161d5c0
 
161d5c0
+    /*
161d5c0
+     * Flush the cache if there were previously non-coherent writeable
161d5c0
+     * mappings of this page.  This forces the page to be coherent before it
161d5c0
+     * is freed back to the heap.
161d5c0
+     */
161d5c0
+    if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
161d5c0
+    {
161d5c0
+        void *addr = __map_domain_page(page);
161d5c0
+
161d5c0
+        cache_flush(addr, PAGE_SIZE);
161d5c0
+        unmap_domain_page(addr);
161d5c0
+    }
161d5c0
+
161d5c0
     return rc;
161d5c0
 }
161d5c0
 
161d5c0
@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
161d5c0
     if ( unlikely(!(nx & PGT_validated)) )
161d5c0
     {
161d5c0
         /*
161d5c0
+         * Flush the cache if there were previously non-coherent mappings of
161d5c0
+         * this page, and we're trying to use it as anything other than a
161d5c0
+         * writeable page.  This forces the page to be coherent before we
161d5c0
+         * validate its contents for safety.
161d5c0
+         */
161d5c0
+        if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
161d5c0
+        {
161d5c0
+            void *addr = __map_domain_page(page);
161d5c0
+
161d5c0
+            cache_flush(addr, PAGE_SIZE);
161d5c0
+            unmap_domain_page(addr);
161d5c0
+
161d5c0
+            page->u.inuse.type_info &= ~PGT_non_coherent;
161d5c0
+        }
161d5c0
+
161d5c0
+        /*
161d5c0
          * No special validation needed for writable or shared pages.  Page
161d5c0
          * tables and GDT/LDT need to have their contents audited.
161d5c0
          *
161d5c0
diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
161d5c0
index 0325618c9883..81c72e61ed55 100644
161d5c0
--- a/xen/arch/x86/pv/grant_table.c
161d5c0
+++ b/xen/arch/x86/pv/grant_table.c
161d5c0
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
161d5c0
 
161d5c0
     ol1e = *pl1e;
161d5c0
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
161d5c0
+    {
161d5c0
+        /*
161d5c0
+         * We always create mappings in this path.  However, our caller,
161d5c0
+         * map_grant_ref(), only passes potentially non-zero cache_flags for
161d5c0
+         * MMIO frames, so this path doesn't create non-coherent mappings of
161d5c0
+         * RAM frames and there's no need to calculate PGT_non_coherent.
161d5c0
+         */
161d5c0
+        ASSERT(!cache_flags || is_iomem_page(frame));
161d5c0
+
161d5c0
         rc = GNTST_okay;
161d5c0
+    }
161d5c0
 
161d5c0
  out_unlock:
161d5c0
     page_unlock(page);
161d5c0
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
161d5c0
                  l1e_get_flags(ol1e), addr, grant_pte_flags);
161d5c0
 
161d5c0
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
161d5c0
+    {
161d5c0
+        /*
161d5c0
+         * Generally, replace_grant_pv_mapping() is used to destroy mappings
161d5c0
+         * (nl1e = l1e_empty()), but it can be a present mapping on the
161d5c0
+         * GNTABOP_unmap_and_replace path.
161d5c0
+         *
161d5c0
+         * In such cases, the PTE is fully transplanted from its old location
161d5c0
+         * via steal_linear_addr(), so we need not perform PGT_non_coherent
161d5c0
+         * checking here.
161d5c0
+         */
161d5c0
         rc = GNTST_okay;
161d5c0
+    }
161d5c0
 
161d5c0
  out_unlock:
161d5c0
     page_unlock(page);
161d5c0
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
161d5c0
index 8a9a43bb0a9d..7464167ae192 100644
161d5c0
--- a/xen/include/asm-x86/mm.h
161d5c0
+++ b/xen/include/asm-x86/mm.h
161d5c0
@@ -53,8 +53,12 @@
161d5c0
 #define _PGT_partial      PG_shift(8)
161d5c0
 #define PGT_partial       PG_mask(1, 8)
161d5c0
 
161d5c0
+/* Has this page been mapped writeable with a non-coherent memory type? */
161d5c0
+#define _PGT_non_coherent PG_shift(9)
161d5c0
+#define PGT_non_coherent  PG_mask(1, 9)
161d5c0
+
161d5c0
  /* Count of uses of this frame as its current type. */
161d5c0
-#define PGT_count_width   PG_shift(8)
161d5c0
+#define PGT_count_width   PG_shift(9)
161d5c0
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
161d5c0
 
161d5c0
 /* Are the 'type mask' bits identical? */