Blob Blame History Raw
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: re-assign devices directly

Devices with RMRRs, due to it being unspecified how/when the specified
memory regions may get accessed, may not be left disconnected from their
respective mappings (as long as it's not certain that the device has
been fully quiesced). Hence rather than unmapping the old context and
then mapping the new one, re-assignment needs to be done in a single
step.

This is CVE-2022-26359 / part of XSA-400.

Reported-by: Roger Pau Monné <roger.pau@citrix.com>

Similarly quarantining scratch-page mode relies on page tables to be
continuously wired up.

To avoid complicating things more than necessary, treat all devices
mostly equally, i.e. regardless of their association with any RMRRs. The
main difference is when it comes to updating context entries, which need
to be atomic when there are RMRRs. Yet atomicity can only be achieved
with CMPXCHG16B, availability of which we can't take for given.

The seemingly complicated choice of non-negative return values for
domain_context_mapping_one() is to limit code churn: This way callers
passing NULL for pdev don't need fiddling with.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr);
 void *map_vtd_domain_page(u64 maddr);
 void unmap_vtd_domain_page(void *va);
 int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
-                               u8 bus, u8 devfn, const struct pci_dev *);
+                               uint8_t bus, uint8_t devfn,
+                               const struct pci_dev *pdev, unsigned int mode);
 int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
                              u8 bus, u8 devfn);
 int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
@@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void);
 void platform_quirks_init(void);
 void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
 void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
-int __must_check me_wifi_quirk(struct domain *domain,
-                               u8 bus, u8 devfn, int map);
+int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
+                               uint8_t devfn, unsigned int mode);
 void pci_vtd_quirk(const struct pci_dev *);
 void quirk_iommu_caps(struct vtd_iommu *iommu);
 
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -116,6 +116,7 @@ static int context_set_domain_id(struct
     }
 
     set_bit(i, iommu->domid_bitmap);
+    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
     return 0;
 }
@@ -1353,15 +1354,27 @@ static void __hwdom_init intel_iommu_hwd
     }
 }
 
+/*
+ * This function returns
+ * - a negative errno value upon error,
+ * - zero upon success when previously the entry was non-present, or this isn't
+ *   the "main" request for a device (pdev == NULL), or for no-op quarantining
+ *   assignments,
+ * - positive (one) upon success when previously the entry was present and this
+ *   is the "main" request for a device (pdev != NULL).
+ */
 int domain_context_mapping_one(
     struct domain *domain,
     struct vtd_iommu *iommu,
-    u8 bus, u8 devfn, const struct pci_dev *pdev)
+    uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
+    unsigned int mode)
 {
     struct domain_iommu *hd = dom_iommu(domain);
-    struct context_entry *context, *context_entries;
+    struct context_entry *context, *context_entries, lctxt;
+    __uint128_t old;
     u64 maddr, pgd_maddr;
-    u16 seg = iommu->drhd->segment;
+    uint16_t seg = iommu->drhd->segment, prev_did = 0;
+    struct domain *prev_dom = NULL;
     int agaw, rc, ret;
     bool_t flush_dev_iotlb;
 
@@ -1370,17 +1383,32 @@ int domain_context_mapping_one(
     maddr = bus_to_context_maddr(iommu, bus);
     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
     context = &context_entries[devfn];
+    old = (lctxt = *context).full;
 
-    if ( context_present(*context) )
+    if ( context_present(lctxt) )
     {
-        spin_unlock(&iommu->lock);
-        unmap_vtd_domain_page(context_entries);
-        return 0;
+        domid_t domid;
+
+        prev_did = context_domain_id(lctxt);
+        domid = iommu->domid_map[prev_did];
+        if ( domid < DOMID_FIRST_RESERVED )
+            prev_dom = rcu_lock_domain_by_id(domid);
+        else if ( domid == DOMID_IO )
+            prev_dom = rcu_lock_domain(dom_io);
+        if ( !prev_dom )
+        {
+            spin_unlock(&iommu->lock);
+            unmap_vtd_domain_page(context_entries);
+            dprintk(XENLOG_DEBUG VTDPREFIX,
+                    "no domain for did %u (nr_dom %u)\n",
+                    prev_did, cap_ndoms(iommu->cap));
+            return -ESRCH;
+        }
     }
 
     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
     {
-        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+        context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU);
         agaw = level_to_agaw(iommu->nr_pt_levels);
     }
     else
@@ -1397,6 +1425,8 @@ int domain_context_mapping_one(
                 spin_unlock(&hd->arch.mapping_lock);
                 spin_unlock(&iommu->lock);
                 unmap_vtd_domain_page(context_entries);
+                if ( prev_dom )
+                    rcu_unlock_domain(prev_dom);
                 return -ENOMEM;
             }
         }
@@ -1414,33 +1444,102 @@ int domain_context_mapping_one(
                 goto nomem;
         }
 
-        context_set_address_root(*context, pgd_maddr);
+        context_set_address_root(lctxt, pgd_maddr);
         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
-            context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
+            context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
         else
-            context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+            context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL);
 
         spin_unlock(&hd->arch.mapping_lock);
     }
 
-    if ( context_set_domain_id(context, domain, iommu) )
+    rc = context_set_domain_id(&lctxt, domain, iommu);
+    if ( rc )
     {
+    unlock:
         spin_unlock(&iommu->lock);
         unmap_vtd_domain_page(context_entries);
-        return -EFAULT;
+        if ( prev_dom )
+            rcu_unlock_domain(prev_dom);
+        return rc;
+    }
+
+    if ( !prev_dom )
+    {
+        context_set_address_width(lctxt, agaw);
+        context_set_fault_enable(lctxt);
+        context_set_present(lctxt);
+    }
+    else if ( prev_dom == domain )
+    {
+        ASSERT(lctxt.full == context->full);
+        rc = !!pdev;
+        goto unlock;
+    }
+    else
+    {
+        ASSERT(context_address_width(lctxt) == agaw);
+        ASSERT(!context_fault_disable(lctxt));
+    }
+
+    if ( cpu_has_cx16 )
+    {
+        __uint128_t res = cmpxchg16b(context, &old, &lctxt.full);
+
+        /*
+         * Hardware does not update the context entry behind our backs,
+         * so the return value should match "old".
+         */
+        if ( res != old )
+        {
+            if ( pdev )
+                check_cleanup_domid_map(domain, pdev, iommu);
+            printk(XENLOG_ERR
+                   "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
+                   pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                   (uint64_t)(res >> 64), (uint64_t)res,
+                   (uint64_t)(old >> 64), (uint64_t)old);
+            rc = -EILSEQ;
+            goto unlock;
+        }
+    }
+    else if ( !prev_dom || !(mode & MAP_WITH_RMRR) )
+    {
+        context_clear_present(*context);
+        iommu_sync_cache(context, sizeof(*context));
+
+        write_atomic(&context->hi, lctxt.hi);
+        /* No barrier should be needed between these two. */
+        write_atomic(&context->lo, lctxt.lo);
+    }
+    else /* Best effort, updating DID last. */
+    {
+         /*
+          * By non-atomically updating the context entry's DID field last,
+          * during a short window in time TLB entries with the old domain ID
+          * but the new page tables may be inserted.  This could affect I/O
+          * of other devices using this same (old) domain ID.  Such updating
+          * therefore is not a problem if this was the only device associated
+          * with the old domain ID.  Diverting I/O of any of a dying domain's
+          * devices to the quarantine page tables is intended anyway.
+          */
+        if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) )
+            printk(XENLOG_WARNING VTDPREFIX
+                   " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
+                   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom);
+
+        write_atomic(&context->lo, lctxt.lo);
+        /* No barrier should be needed between these two. */
+        write_atomic(&context->hi, lctxt.hi);
     }
 
-    context_set_address_width(*context, agaw);
-    context_set_fault_enable(*context);
-    context_set_present(*context);
     iommu_sync_cache(context, sizeof(struct context_entry));
     spin_unlock(&iommu->lock);
 
-    /* Context entry was previously non-present (with domid 0). */
-    rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
-                                    DMA_CCMD_MASK_NOBIT, 1);
+    rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn),
+                                    DMA_CCMD_MASK_NOBIT, !prev_dom);
     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
-    ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
+    ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb);
 
     /*
      * The current logic for returns:
@@ -1461,17 +1560,26 @@ int domain_context_mapping_one(
     unmap_vtd_domain_page(context_entries);
 
     if ( !seg && !rc )
-        rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
+        rc = me_wifi_quirk(domain, bus, devfn, mode);
 
     if ( rc )
     {
-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        if ( !prev_dom )
+            ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+            ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+                                             mode & MAP_WITH_RMRR) < 0;
+        else
+            ret = 1;
 
         if ( !ret && pdev && pdev->devfn == devfn )
             check_cleanup_domid_map(domain, pdev, iommu);
     }
 
-    return rc;
+    if ( prev_dom )
+        rcu_unlock_domain(prev_dom);
+
+    return rc ?: pdev && prev_dom;
 }
 
 static int domain_context_unmap(struct domain *d, uint8_t devfn,
@@ -1481,8 +1589,10 @@ static int domain_context_mapping(struct
                                   struct pci_dev *pdev)
 {
     struct acpi_drhd_unit *drhd;
+    const struct acpi_rmrr_unit *rmrr;
     int ret = 0;
-    uint16_t seg = pdev->seg;
+    unsigned int i, mode = 0;
+    uint16_t seg = pdev->seg, bdf;
     uint8_t bus = pdev->bus, secbus;
 
     drhd = acpi_find_matched_drhd_unit(pdev);
@@ -1502,8 +1612,29 @@ static int domain_context_mapping(struct
 
     ASSERT(pcidevs_locked());
 
+    for_each_rmrr_device( rmrr, bdf, i )
+    {
+        if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf )
+            continue;
+
+        mode |= MAP_WITH_RMRR;
+        break;
+    }
+
+    if ( domain != pdev->domain )
+    {
+        if ( pdev->domain->is_dying )
+            mode |= MAP_OWNER_DYING;
+        else if ( drhd &&
+                  !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) &&
+                  !pdev->phantom_stride )
+            mode |= MAP_SINGLE_DEVICE;
+    }
+
     switch ( pdev->type )
     {
+        bool prev_present;
+
     case DEV_TYPE_PCI_HOST_BRIDGE:
         if ( iommu_debug )
             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
@@ -1524,7 +1655,9 @@ static int domain_context_mapping(struct
                    domain->domain_id, seg, bus,
                    PCI_SLOT(devfn), PCI_FUNC(devfn));
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev);
+                                         pdev, mode);
+        if ( ret > 0 )
+            ret = 0;
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             enable_ats_device(pdev, &drhd->iommu->ats_devices);
 
@@ -1537,9 +1670,10 @@ static int domain_context_mapping(struct
                    PCI_SLOT(devfn), PCI_FUNC(devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev);
-        if ( ret )
+                                         pdev, mode);
+        if ( ret < 0 )
             break;
+        prev_present = ret;
 
         if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 )
         {
@@ -1547,6 +1681,15 @@ static int domain_context_mapping(struct
                 break;
             ret = -ENXIO;
         }
+        /*
+         * Strictly speaking if the device is the only one behind this bridge
+         * and the only one with this (secbus,0,0) tuple, it could be allowed
+         * to be re-assigned regardless of RMRR presence.  But let's deal with
+         * that case only if it is actually found in the wild.
+         */
+        else if ( prev_present && (mode & MAP_WITH_RMRR) &&
+                  domain != pdev->domain )
+            ret = -EOPNOTSUPP;
 
         /*
          * Mapping a bridge should, if anything, pass the struct pci_dev of
@@ -1555,7 +1698,7 @@ static int domain_context_mapping(struct
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL);
+                                             NULL, mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1570,10 +1713,15 @@ static int domain_context_mapping(struct
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL);
+                                             NULL, mode);
 
         if ( ret )
-            domain_context_unmap(domain, devfn, pdev);
+        {
+            if ( !prev_present )
+                domain_context_unmap(domain, devfn, pdev);
+            else if ( pdev->domain != domain ) /* Avoid infinite recursion. */
+                domain_context_mapping(pdev->domain, devfn, pdev);
+        }
 
         break;
 
@@ -2352,9 +2500,8 @@ static int reassign_device_ownership(
 {
     int ret;
 
-    ret = domain_context_unmap(source, devfn, pdev);
-    if ( ret )
-        return ret;
+    if ( !has_arch_pdevs(target) )
+        vmx_pi_hooks_assign(target);
 
     /*
      * Devices assigned to untrusted domains (here assumed to be any domU)
@@ -2364,6 +2511,31 @@ static int reassign_device_ownership(
     if ( (target != hardware_domain) && !iommu_intremap )
         untrusted_msi = true;
 
+    ret = domain_context_mapping(target, devfn, pdev);
+    if ( ret )
+    {
+        if ( !has_arch_pdevs(target) )
+            vmx_pi_hooks_deassign(target);
+        return ret;
+    }
+
+    if ( pdev->devfn == devfn )
+    {
+        const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
+
+        if ( drhd )
+            check_cleanup_domid_map(source, pdev, drhd->iommu);
+    }
+
+    if ( devfn == pdev->devfn && pdev->domain != target )
+    {
+        list_move(&pdev->domain_list, &target->pdev_list);
+        pdev->domain = target;
+    }
+
+    if ( !has_arch_pdevs(source) )
+        vmx_pi_hooks_deassign(source);
+
     /*
      * If the device belongs to the hardware domain, and it has RMRR, don't
      * remove it from the hardware domain, because BIOS may use RMRR at
@@ -2392,34 +2564,7 @@ static int reassign_device_ownership(
             }
     }
 
-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
-    {
-        list_move(&pdev->domain_list, &dom_io->pdev_list);
-        pdev->domain = dom_io;
-    }
-
-    if ( !has_arch_pdevs(source) )
-        vmx_pi_hooks_deassign(source);
-
-    if ( !has_arch_pdevs(target) )
-        vmx_pi_hooks_assign(target);
-
-    ret = domain_context_mapping(target, devfn, pdev);
-    if ( ret )
-    {
-        if ( !has_arch_pdevs(target) )
-            vmx_pi_hooks_deassign(target);
-
-        return ret;
-    }
-
-    if ( devfn == pdev->devfn && pdev->domain != target )
-    {
-        list_move(&pdev->domain_list, &target->pdev_list);
-        pdev->domain = target;
-    }
-
-    return ret;
+    return 0;
 }
 
 static int intel_iommu_assign_device(
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -202,8 +202,12 @@ struct root_entry {
     do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
 
 struct context_entry {
-    u64 lo;
-    u64 hi;
+    union {
+        struct {
+            uint64_t lo, hi;
+        };
+        __uint128_t full;
+    };
 };
 #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
 #define context_present(c) ((c).lo & 1)
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -344,7 +344,8 @@ void __init platform_quirks_init(void)
  */
 
 static int __must_check map_me_phantom_function(struct domain *domain,
-                                                u32 dev, int map)
+                                                unsigned int dev,
+                                                unsigned int mode)
 {
     struct acpi_drhd_unit *drhd;
     struct pci_dev *pdev;
@@ -355,9 +356,9 @@ static int __must_check map_me_phantom_f
     drhd = acpi_find_matched_drhd_unit(pdev);
 
     /* map or unmap ME phantom function */
-    if ( map )
+    if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
         rc = domain_context_mapping_one(domain, drhd->iommu, 0,
-                                        PCI_DEVFN(dev, 7), NULL);
+                                        PCI_DEVFN(dev, 7), NULL, mode);
     else
         rc = domain_context_unmap_one(domain, drhd->iommu, 0,
                                       PCI_DEVFN(dev, 7));
@@ -365,7 +366,8 @@ static int __must_check map_me_phantom_f
     return rc;
 }
 
-int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
+int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+                  unsigned int mode)
 {
     u32 id;
     int rc = 0;
@@ -389,7 +391,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x423b8086:
             case 0x423c8086:
             case 0x423d8086:
-                rc = map_me_phantom_function(domain, 3, map);
+                rc = map_me_phantom_function(domain, 3, mode);
                 break;
             default:
                 break;
@@ -415,7 +417,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x42388086:        /* Puma Peak */
             case 0x422b8086:
             case 0x422c8086:
-                rc = map_me_phantom_function(domain, 22, map);
+                rc = map_me_phantom_function(domain, 22, mode);
                 break;
             default:
                 break;
--- a/xen/drivers/passthrough/vtd/vtd.h
+++ b/xen/drivers/passthrough/vtd/vtd.h
@@ -22,8 +22,14 @@
 
 #include <xen/iommu.h>
 
-#define MAP_ME_PHANTOM_FUNC      1
-#define UNMAP_ME_PHANTOM_FUNC    0
+/*
+ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode"
+ * parameters.
+ */
+#define MAP_WITH_RMRR         (1u << 0)
+#define MAP_OWNER_DYING       (1u << 1)
+#define MAP_SINGLE_DEVICE     (1u << 2)
+#define UNMAP_ME_PHANTOM_FUNC (1u << 3)
 
 /* Allow for both IOAPIC and IOSAPIC. */
 #define IO_xAPIC_route_entry IO_APIC_route_entry