diff --git a/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch b/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch new file mode 100644 index 0000000..6eb4f71 --- /dev/null +++ b/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch @@ -0,0 +1,117 @@ +From 3581714729cf84df50d6831c4da076e21587408c Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Thu, 15 Jul 2021 09:21:04 +0200 +Subject: [PATCH] VT-d: adjust domid map updating when unmapping context + +When an earlier error occurred, cleaning up the domid mapping data is +wrong, as references likely still exist. The only exception to this is +when the actual unmapping worked, but some flush failed (supposedly +impossible after XSA-373). The guest will get crashed in such a case +though, so add fallback cleanup to domain destruction to cover this +case. This in turn makes it desirable to silence the dprintk() in +domain_iommu_domid(). + +Note that no error will be returned anymore when the lookup fails - in +the common case lookup failure would already have caused +domain_context_unmap_one() to fail, yet even from a more general +perspective it doesn't look right to fail domain_context_unmap() in such +a case when this was the last device, but not when any earlier unmap was +otherwise successful. + +Signed-off-by: Jan Beulich +Reviewed-by: Kevin Tian +master commit: 32655880057ce2829f962d46916ea6cec60f98d3 +master date: 2021-06-24 16:29:13 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 39 ++++++++++++++++++----------- + 1 file changed, 24 insertions(+), 15 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index c0070f1c99..2b3609dae2 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -80,9 +80,11 @@ static int domain_iommu_domid(struct domain *d, + i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); + } + +- dprintk(XENLOG_ERR VTDPREFIX, +- "Cannot get valid iommu domid: domid=%d iommu->index=%d\n", +- d->domain_id, iommu->index); ++ if ( !d->is_dying ) ++ dprintk(XENLOG_ERR VTDPREFIX, ++ "Cannot get valid iommu %u domid: %pd\n", ++ iommu->index, d); ++ + return -1; + } + +@@ -147,6 +149,17 @@ static int context_get_domain_id(struct context_entry *context, + return domid; + } + ++static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) ++{ ++ int iommu_domid = domain_iommu_domid(domain, iommu); ++ ++ if ( iommu_domid >= 0 ) ++ { ++ clear_bit(iommu_domid, iommu->domid_bitmap); ++ iommu->domid_map[iommu_domid] = 0; ++ } ++} ++ + static void sync_cache(const void *addr, unsigned int size) + { + static unsigned long clflush_size = 0; +@@ -1732,6 +1745,9 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + goto out; + } + ++ if ( ret ) ++ goto out; ++ + /* + * if no other devices under the same iommu owned by this domain, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp +@@ -1751,19 +1767,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + + if ( found == 0 ) + { +- int iommu_domid; +- + clear_bit(iommu->index, &dom_iommu(domain)->arch.vtd.iommu_bitmap); +- +- iommu_domid = domain_iommu_domid(domain, iommu); +- if ( iommu_domid == -1 ) +- { +- ret = -EINVAL; +- goto out; +- } +- +- clear_bit(iommu_domid, iommu->domid_bitmap); +- iommu->domid_map[iommu_domid] = 0; ++ cleanup_domid_map(domain, iommu); + } + + out: +@@ -1783,6 +1788,7 @@ static void iommu_domain_teardown(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + struct mapped_rmrr 
*mrmrr, *tmp; ++ const struct acpi_drhd_unit *drhd; + + if ( list_empty(&acpi_drhd_units) ) + return; +@@ -1794,6 +1800,9 @@ static void iommu_domain_teardown(struct domain *d) + } + + ASSERT(!hd->arch.vtd.pgd_maddr); ++ ++ for_each_drhd_unit ( drhd ) ++ cleanup_domid_map(d, drhd->iommu); + } + + static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, +-- +2.30.2 + diff --git a/xen.spec b/xen.spec index 8786b4e..93276be 100644 --- a/xen.spec +++ b/xen.spec @@ -58,7 +58,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.15.0 -Release: 5%{?dist} +Release: 6%{?dist} License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -123,6 +123,20 @@ Patch51: xsa373-4.15-5.patch Patch52: xsa375.patch Patch53: xsa377.patch Patch54: xen.git-58ad654ebce7ccb272a3f4f3482c03aaad850d31.patch +Patch55: xen.git-3581714729cf84df50d6831c4da076e21587408c.patch +Patch56: xsa378-4.15-1.patch +Patch57: xsa378-4.15-2.patch +Patch58: xsa378-4.15-3.patch +Patch59: xsa378-4.15-4.patch +Patch60: xsa378-4.15-5.patch +Patch61: xsa378-4.15-6.patch +Patch62: xsa378-4.15-7.patch +Patch63: xsa378-4.15-8.patch +Patch64: xsa379-4.15.patch +Patch65: xsa380-1.patch +Patch66: xsa380-2.patch +Patch67: xsa382.patch +Patch68: xsa383.patch %if %build_qemutrad @@ -342,6 +356,20 @@ manage Xen virtual machines. %patch52 -p1 %patch53 -p1 %patch54 -p1 +%patch55 -p1 +%patch56 -p1 +%patch57 -p1 +%patch58 -p1 +%patch59 -p1 +%patch60 -p1 +%patch61 -p1 +%patch62 -p1 +%patch63 -p1 +%patch64 -p1 +%patch65 -p1 +%patch66 -p1 +%patch67 -p1 +%patch68 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -640,6 +668,9 @@ if [ -f /sbin/grub2-mkconfig ]; then DIR=/usr/lib/grub/i386-pc TARGET=/boot/grub2/i386-pc do_it $DIR $TARGET + DIR=/usr/lib/grub/x86_64-efi + TARGET=/boot/grub2/x86_64-efi + do_it $DIR $TARGET fi if [ -f /boot/efi/EFI/fedora/grub.cfg ]; then DIR=/usr/lib/grub/x86_64-efi @@ -939,6 +970,20 @@ fi %endif %changelog +* Wed Aug 25 2021 Michael Young - 4.15.0-6 +- IOMMU page mapping issues on x86 [XSA-378, CVE-2021-28694, + CVE-2021-28695, CVE-2021-28696] (#1997531) (#1997568) + (#1997537) +- grant table v2 status pages may remain accessible after de-allocation + [XSA-379, CVE-2021-28697] (#1997520) +- long running loops in grant table handling [XSA-380, CVE-2021-28698] + (#1997526) +- inadequate grant-v2 status frames array bounds check [XSA-382, + CVE-2021-28699] (#1997523) +- xen/arm: No memory limit for dom0less domUs [XSA-383, CVE-2021-28700] + (#1997527) +- grub x86_64-efi modules now go into /boot/grub2 + * Thu Aug 12 2021 Michael Young - 4.15.0-5 - work around build issue with GNU ld 2.37 (#1990344) diff --git a/xsa378-4.15-1.patch b/xsa378-4.15-1.patch new file mode 100644 index 0000000..4b38f96 --- /dev/null +++ b/xsa378-4.15-1.patch @@ -0,0 +1,142 @@ +From: Jan Beulich +Subject: AMD/IOMMU: correct global exclusion range extending + +Besides unity mapping regions, the AMD IOMMU spec also provides for +exclusion ranges (areas of memory not to be subject to DMA translation) +to be specified by firmware in the ACPI tables. The spec does not put +any constraints on the number of such regions. + +Blindly assuming all addresses between any two such ranges should also +be excluded can't be right. 
Since hardware has room for just a single +such range (comprised of the Exclusion Base Register and the Exclusion +Range Limit Register), combine only adjacent or overlapping regions (for +now; this may require further adjustment in case table entries aren't +sorted by address) with matching exclusion_allow_all settings. This +requires bubbling up error indicators, such that IOMMU init can be +failed when concatenation wasn't possible. + +Furthermore, since the exclusion range specified in IOMMU registers +implies R/W access, reject requests asking for less permissions (this +will be brought closer to the spec by a subsequent change). + +This is part of XSA-378 / CVE-2021-28695. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c +@@ -116,12 +116,21 @@ static struct amd_iommu * __init find_io + return NULL; + } + +-static void __init reserve_iommu_exclusion_range( +- struct amd_iommu *iommu, uint64_t base, uint64_t limit) ++static int __init reserve_iommu_exclusion_range( ++ struct amd_iommu *iommu, uint64_t base, uint64_t limit, ++ bool all, bool iw, bool ir) + { ++ if ( !ir || !iw ) ++ return -EPERM; ++ + /* need to extend exclusion range? */ + if ( iommu->exclusion_enable ) + { ++ if ( iommu->exclusion_limit + PAGE_SIZE < base || ++ limit + PAGE_SIZE < iommu->exclusion_base || ++ iommu->exclusion_allow_all != all ) ++ return -EBUSY; ++ + if ( iommu->exclusion_base < base ) + base = iommu->exclusion_base; + if ( iommu->exclusion_limit > limit ) +@@ -129,16 +138,11 @@ static void __init reserve_iommu_exclusi + } + + iommu->exclusion_enable = IOMMU_CONTROL_ENABLED; ++ iommu->exclusion_allow_all = all; + iommu->exclusion_base = base; + iommu->exclusion_limit = limit; +-} + +-static void __init reserve_iommu_exclusion_range_all( +- struct amd_iommu *iommu, +- unsigned long base, unsigned long limit) +-{ +- reserve_iommu_exclusion_range(iommu, base, limit); +- iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED; ++ return 0; + } + + static void __init reserve_unity_map_for_device( +@@ -176,6 +180,7 @@ static int __init register_exclusion_ran + unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; + unsigned int bdf; ++ int rc = 0; + + /* is part of exclusion range inside of IOMMU virtual address space? 
*/ + /* note: 'limit' parameter is assumed to be page-aligned */ +@@ -197,10 +202,15 @@ static int __init register_exclusion_ran + if ( limit >= iommu_top ) + { + for_each_amd_iommu( iommu ) +- reserve_iommu_exclusion_range_all(iommu, base, limit); ++ { ++ rc = reserve_iommu_exclusion_range(iommu, base, limit, ++ true /* all */, iw, ir); ++ if ( rc ) ++ break; ++ } + } + +- return 0; ++ return rc; + } + + static int __init register_exclusion_range_for_device( +@@ -211,6 +221,7 @@ static int __init register_exclusion_ran + unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; + u16 req; ++ int rc = 0; + + iommu = find_iommu_for_device(seg, bdf); + if ( !iommu ) +@@ -240,12 +251,13 @@ static int __init register_exclusion_ran + /* register IOMMU exclusion range settings for device */ + if ( limit >= iommu_top ) + { +- reserve_iommu_exclusion_range(iommu, base, limit); ++ rc = reserve_iommu_exclusion_range(iommu, base, limit, ++ false /* all */, iw, ir); + ivrs_mappings[bdf].dte_allow_exclusion = true; + ivrs_mappings[req].dte_allow_exclusion = true; + } + +- return 0; ++ return rc; + } + + static int __init register_exclusion_range_for_iommu_devices( +@@ -255,6 +267,7 @@ static int __init register_exclusion_ran + unsigned long range_top, iommu_top, length; + unsigned int bdf; + u16 req; ++ int rc = 0; + + /* is part of exclusion range inside of IOMMU virtual address space? */ + /* note: 'limit' parameter is assumed to be page-aligned */ +@@ -285,8 +298,10 @@ static int __init register_exclusion_ran + + /* register IOMMU exclusion range settings */ + if ( limit >= iommu_top ) +- reserve_iommu_exclusion_range_all(iommu, base, limit); +- return 0; ++ rc = reserve_iommu_exclusion_range(iommu, base, limit, ++ true /* all */, iw, ir); ++ ++ return rc; + } + + static int __init parse_ivmd_device_select( diff --git a/xsa378-4.15-2.patch b/xsa378-4.15-2.patch new file mode 100644 index 0000000..aa19772 --- /dev/null +++ b/xsa378-4.15-2.patch @@ -0,0 +1,218 @@ +From: Jan Beulich +Subject: AMD/IOMMU: correct device unity map handling + +Blindly assuming all addresses between any two such ranges, specified by +firmware in the ACPI tables, should also be unity-mapped can't be right. +Nor can it be correct to merge ranges with differing permissions. Track +ranges individually; don't merge at all, but check for overlaps instead. +This requires bubbling up error indicators, such that IOMMU init can be +failed when allocation of a new tracking struct wasn't possible, or an +overlap was detected. + +At this occasion also stop ignoring +amd_iommu_reserve_domain_unity_map()'s return value. + +This is part of XSA-378 / CVE-2021-28695. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: George Dunlap +Reviewed-by: Paul Durrant + +--- a/xen/drivers/passthrough/amd/iommu.h ++++ b/xen/drivers/passthrough/amd/iommu.h +@@ -107,20 +107,24 @@ struct amd_iommu { + struct list_head ats_devices; + }; + ++struct ivrs_unity_map { ++ bool read:1; ++ bool write:1; ++ paddr_t addr; ++ unsigned long length; ++ struct ivrs_unity_map *next; ++}; ++ + struct ivrs_mappings { + uint16_t dte_requestor_id; + bool valid:1; + bool dte_allow_exclusion:1; +- bool unity_map_enable:1; +- bool write_permission:1; +- bool read_permission:1; + + /* ivhd device data settings */ + uint8_t device_flags; + +- unsigned long addr_range_start; +- unsigned long addr_range_length; + struct amd_iommu *iommu; ++ struct ivrs_unity_map *unity_map; + + /* per device interrupt remapping table */ + void *intremap_table; +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c +@@ -145,32 +145,48 @@ static int __init reserve_iommu_exclusio + return 0; + } + +-static void __init reserve_unity_map_for_device( +- u16 seg, u16 bdf, unsigned long base, +- unsigned long length, u8 iw, u8 ir) ++static int __init reserve_unity_map_for_device( ++ uint16_t seg, uint16_t bdf, unsigned long base, ++ unsigned long length, bool iw, bool ir) + { + struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); +- unsigned long old_top, new_top; ++ struct ivrs_unity_map *unity_map = ivrs_mappings[bdf].unity_map; + +- /* need to extend unity-mapped range? */ +- if ( ivrs_mappings[bdf].unity_map_enable ) ++ /* Check for overlaps. */ ++ for ( ; unity_map; unity_map = unity_map->next ) + { +- old_top = ivrs_mappings[bdf].addr_range_start + +- ivrs_mappings[bdf].addr_range_length; +- new_top = base + length; +- if ( old_top > new_top ) +- new_top = old_top; +- if ( ivrs_mappings[bdf].addr_range_start < base ) +- base = ivrs_mappings[bdf].addr_range_start; +- length = new_top - base; +- } +- +- /* extend r/w permissioms and keep aggregate */ +- ivrs_mappings[bdf].write_permission = iw; +- ivrs_mappings[bdf].read_permission = ir; +- ivrs_mappings[bdf].unity_map_enable = true; +- ivrs_mappings[bdf].addr_range_start = base; +- ivrs_mappings[bdf].addr_range_length = length; ++ /* ++ * Exact matches are okay. This can in particular happen when ++ * register_exclusion_range_for_device() calls here twice for the ++ * same (s,b,d,f). ++ */ ++ if ( base == unity_map->addr && length == unity_map->length && ++ ir == unity_map->read && iw == unity_map->write ) ++ return 0; ++ ++ if ( unity_map->addr + unity_map->length > base && ++ base + length > unity_map->addr ) ++ { ++ AMD_IOMMU_DEBUG("IVMD Error: overlap [%lx,%lx) vs [%lx,%lx)\n", ++ base, base + length, unity_map->addr, ++ unity_map->addr + unity_map->length); ++ return -EPERM; ++ } ++ } ++ ++ /* Populate and insert a new unity map. 
*/ ++ unity_map = xmalloc(struct ivrs_unity_map); ++ if ( !unity_map ) ++ return -ENOMEM; ++ ++ unity_map->read = ir; ++ unity_map->write = iw; ++ unity_map->addr = base; ++ unity_map->length = length; ++ unity_map->next = ivrs_mappings[bdf].unity_map; ++ ivrs_mappings[bdf].unity_map = unity_map; ++ ++ return 0; + } + + static int __init register_exclusion_range_for_all_devices( +@@ -193,13 +209,13 @@ static int __init register_exclusion_ran + length = range_top - base; + /* reserve r/w unity-mapped page entries for devices */ + /* note: these entries are part of the exclusion range */ +- for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) +- reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); ++ for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) ++ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); + /* push 'base' just outside of virtual address space */ + base = iommu_top; + } + /* register IOMMU exclusion range settings */ +- if ( limit >= iommu_top ) ++ if ( !rc && limit >= iommu_top ) + { + for_each_amd_iommu( iommu ) + { +@@ -241,15 +257,15 @@ static int __init register_exclusion_ran + length = range_top - base; + /* reserve unity-mapped page entries for device */ + /* note: these entries are part of the exclusion range */ +- reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); +- reserve_unity_map_for_device(seg, req, base, length, iw, ir); ++ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir) ?: ++ reserve_unity_map_for_device(seg, req, base, length, iw, ir); + + /* push 'base' just outside of virtual address space */ + base = iommu_top; + } + + /* register IOMMU exclusion range settings for device */ +- if ( limit >= iommu_top ) ++ if ( !rc && limit >= iommu_top ) + { + rc = reserve_iommu_exclusion_range(iommu, base, limit, + false /* all */, iw, ir); +@@ -280,15 +296,15 @@ static int __init register_exclusion_ran + length = range_top - base; + /* reserve r/w unity-mapped page entries for devices */ + /* note: these entries are part of the exclusion range */ +- for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) ++ for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) + { + if ( iommu == find_iommu_for_device(iommu->seg, bdf) ) + { +- reserve_unity_map_for_device(iommu->seg, bdf, base, length, +- iw, ir); + req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; +- reserve_unity_map_for_device(iommu->seg, req, base, length, +- iw, ir); ++ rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, ++ iw, ir) ?: ++ reserve_unity_map_for_device(iommu->seg, req, base, length, ++ iw, ir); + } + } + +@@ -297,7 +313,7 @@ static int __init register_exclusion_ran + } + + /* register IOMMU exclusion range settings */ +- if ( limit >= iommu_top ) ++ if ( !rc && limit >= iommu_top ) + rc = reserve_iommu_exclusion_range(iommu, base, limit, + true /* all */, iw, ir); + +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -367,15 +367,17 @@ static int amd_iommu_assign_device(struc + struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); + int bdf = PCI_BDF2(pdev->bus, devfn); + int req_id = get_dma_requestor_id(pdev->seg, bdf); ++ const struct ivrs_unity_map *unity_map; + +- if ( ivrs_mappings[req_id].unity_map_enable ) ++ for ( unity_map = ivrs_mappings[req_id].unity_map; unity_map; ++ unity_map = unity_map->next ) + { +- amd_iommu_reserve_domain_unity_map( +- d, +- ivrs_mappings[req_id].addr_range_start, +- ivrs_mappings[req_id].addr_range_length, +- 
ivrs_mappings[req_id].write_permission, +- ivrs_mappings[req_id].read_permission); ++ int rc = amd_iommu_reserve_domain_unity_map( ++ d, unity_map->addr, unity_map->length, ++ unity_map->write, unity_map->read); ++ ++ if ( rc ) ++ return rc; + } + + return reassign_device(pdev->domain, d, devfn, pdev); diff --git a/xsa378-4.15-3.patch b/xsa378-4.15-3.patch new file mode 100644 index 0000000..9e1bd18 --- /dev/null +++ b/xsa378-4.15-3.patch @@ -0,0 +1,102 @@ +From: Jan Beulich +Subject: IOMMU: also pass p2m_access_t to p2m_get_iommu_flags() + +A subsequent change will want to customize the IOMMU permissions based +on this. + +This is part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -681,7 +681,7 @@ ept_set_entry(struct p2m_domain *p2m, gf + uint8_t ipat = 0; + bool_t need_modify_vtd_table = 1; + bool_t vtd_pte_present = 0; +- unsigned int iommu_flags = p2m_get_iommu_flags(p2mt, mfn); ++ unsigned int iommu_flags = p2m_get_iommu_flags(p2mt, p2ma, mfn); + bool_t needs_sync = 1; + ept_entry_t old_entry = { .epte = 0 }; + ept_entry_t new_entry = { .epte = 0 }; +@@ -809,8 +809,8 @@ ept_set_entry(struct p2m_domain *p2m, gf + + /* Safe to read-then-write because we hold the p2m lock */ + if ( ept_entry->mfn == new_entry.mfn && +- p2m_get_iommu_flags(ept_entry->sa_p2mt, _mfn(ept_entry->mfn)) == +- iommu_flags ) ++ p2m_get_iommu_flags(ept_entry->sa_p2mt, ept_entry->access, ++ _mfn(ept_entry->mfn)) == iommu_flags ) + need_modify_vtd_table = 0; + + ept_p2m_type_to_flags(p2m, &new_entry); +--- a/xen/arch/x86/mm/p2m-pt.c ++++ b/xen/arch/x86/mm/p2m-pt.c +@@ -545,6 +545,16 @@ int p2m_pt_handle_deferred_changes(uint6 + return rc; + } + ++/* Reconstruct a fake p2m_access_t from stored PTE flags. */ ++static p2m_access_t p2m_flags_to_access(unsigned int flags) ++{ ++ if ( flags & _PAGE_PRESENT ) ++ return p2m_access_n; ++ ++ /* No need to look at _PAGE_NX for now. */ ++ return flags & _PAGE_RW ? p2m_access_rw : p2m_access_r; ++} ++ + /* Checks only applicable to entries with order > PAGE_ORDER_4K */ + static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old, + unsigned int order) +@@ -579,7 +589,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, + l2_pgentry_t l2e_content; + l3_pgentry_t l3e_content; + int rc; +- unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt, mfn); ++ unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt, p2ma, mfn); + /* + * old_mfn and iommu_old_flags control possible flush/update needs on the + * IOMMU: We need to flush when MFN or flags (i.e. permissions) change. 
+@@ -642,6 +652,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, + old_mfn = l1e_get_pfn(*p2m_entry); + iommu_old_flags = + p2m_get_iommu_flags(p2m_flags_to_type(flags), ++ p2m_flags_to_access(flags), + _mfn(old_mfn)); + } + else +@@ -684,9 +695,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + old_mfn = l1e_get_pfn(*p2m_entry); ++ flags = l1e_get_flags(*p2m_entry); + iommu_old_flags = +- p2m_get_iommu_flags(p2m_flags_to_type(l1e_get_flags(*p2m_entry)), +- _mfn(old_mfn)); ++ p2m_get_iommu_flags(p2m_flags_to_type(flags), ++ p2m_flags_to_access(flags), _mfn(old_mfn)); + + if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) + entry_content = p2m_l1e_from_pfn(mfn_x(mfn), +@@ -714,6 +726,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, + old_mfn = l1e_get_pfn(*p2m_entry); + iommu_old_flags = + p2m_get_iommu_flags(p2m_flags_to_type(flags), ++ p2m_flags_to_access(flags), + _mfn(old_mfn)); + } + else +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -915,7 +915,8 @@ static inline void p2m_altp2m_check(stru + /* + * p2m type to IOMMU flags + */ +-static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt, mfn_t mfn) ++static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt, ++ p2m_access_t p2ma, mfn_t mfn) + { + unsigned int flags; + diff --git a/xsa378-4.15-4.patch b/xsa378-4.15-4.patch new file mode 100644 index 0000000..a1d86db --- /dev/null +++ b/xsa378-4.15-4.patch @@ -0,0 +1,399 @@ +From: Jan Beulich +Subject: IOMMU: generalize VT-d's tracking of mapped RMRR regions + +In order to re-use it elsewhere, move the logic to vendor independent +code and strip it of RMRR specifics. + +Note that the prior "map" parameter gets folded into the new "p2ma" one +(which AMD IOMMU code will want to make use of), assigning alternative +meaning ("unmap") to p2m_access_x. Prepare set_identity_p2m_entry() and +p2m_get_iommu_flags() for getting passed access types other than +p2m_access_rw (in the latter case just for p2m_mmio_direct requests). + +Note also that, to be on the safe side, an overlap check gets added to +the main loop of iommu_identity_mapping(). + +This is part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -1365,7 +1365,7 @@ int set_identity_p2m_entry(struct domain + return 0; + return iommu_legacy_map(d, _dfn(gfn_l), _mfn(gfn_l), + 1ul << PAGE_ORDER_4K, +- IOMMUF_readable | IOMMUF_writable); ++ p2m_access_to_iommu_flags(p2ma)); + } + + gfn_lock(p2m, gfn, 0); +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -42,12 +42,6 @@ + #include "vtd.h" + #include "../ats.h" + +-struct mapped_rmrr { +- struct list_head list; +- u64 base, end; +- unsigned int count; +-}; +- + /* Possible unfiltered LAPIC/MSI messages from untrusted sources? 
*/ + bool __read_mostly untrusted_msi; + +@@ -1311,7 +1305,6 @@ static int intel_iommu_domain_init(struc + struct domain_iommu *hd = dom_iommu(d); + + hd->arch.vtd.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); +- INIT_LIST_HEAD(&hd->arch.vtd.mapped_rmrrs); + + return 0; + } +@@ -1788,17 +1781,12 @@ static void iommu_clear_root_pgtable(str + static void iommu_domain_teardown(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); +- struct mapped_rmrr *mrmrr, *tmp; + const struct acpi_drhd_unit *drhd; + + if ( list_empty(&acpi_drhd_units) ) + return; + +- list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.vtd.mapped_rmrrs, list ) +- { +- list_del(&mrmrr->list); +- xfree(mrmrr); +- } ++ iommu_identity_map_teardown(d); + + ASSERT(!hd->arch.vtd.pgd_maddr); + +@@ -1946,74 +1934,6 @@ static int __init vtd_ept_page_compatibl + (ept_has_1gb(ept_cap) && opt_hap_1gb) <= cap_sps_1gb(vtd_cap); + } + +-static int rmrr_identity_mapping(struct domain *d, bool_t map, +- const struct acpi_rmrr_unit *rmrr, +- u32 flag) +-{ +- unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K; +- unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K; +- struct mapped_rmrr *mrmrr; +- struct domain_iommu *hd = dom_iommu(d); +- +- ASSERT(pcidevs_locked()); +- ASSERT(rmrr->base_address < rmrr->end_address); +- +- /* +- * No need to acquire hd->arch.mapping_lock: Both insertion and removal +- * get done while holding pcidevs_lock. +- */ +- list_for_each_entry( mrmrr, &hd->arch.vtd.mapped_rmrrs, list ) +- { +- if ( mrmrr->base == rmrr->base_address && +- mrmrr->end == rmrr->end_address ) +- { +- int ret = 0; +- +- if ( map ) +- { +- ++mrmrr->count; +- return 0; +- } +- +- if ( --mrmrr->count ) +- return 0; +- +- while ( base_pfn < end_pfn ) +- { +- if ( clear_identity_p2m_entry(d, base_pfn) ) +- ret = -ENXIO; +- base_pfn++; +- } +- +- list_del(&mrmrr->list); +- xfree(mrmrr); +- return ret; +- } +- } +- +- if ( !map ) +- return -ENOENT; +- +- while ( base_pfn < end_pfn ) +- { +- int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag); +- +- if ( err ) +- return err; +- base_pfn++; +- } +- +- mrmrr = xmalloc(struct mapped_rmrr); +- if ( !mrmrr ) +- return -ENOMEM; +- mrmrr->base = rmrr->base_address; +- mrmrr->end = rmrr->end_address; +- mrmrr->count = 1; +- list_add_tail(&mrmrr->list, &hd->arch.vtd.mapped_rmrrs); +- +- return 0; +-} +- + static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) + { + struct acpi_rmrr_unit *rmrr; +@@ -2045,7 +1965,9 @@ static int intel_iommu_add_device(u8 dev + * Since RMRRs are always reserved in the e820 map for the hardware + * domain, there shouldn't be a conflict. + */ +- ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0); ++ ret = iommu_identity_mapping(pdev->domain, p2m_access_rw, ++ rmrr->base_address, rmrr->end_address, ++ 0); + if ( ret ) + dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n", + pdev->domain->domain_id); +@@ -2090,7 +2012,8 @@ static int intel_iommu_remove_device(u8 + * Any flag is nothing to clear these mappings but here + * its always safe and strict to set 0. + */ +- rmrr_identity_mapping(pdev->domain, 0, rmrr, 0); ++ iommu_identity_mapping(pdev->domain, p2m_access_x, rmrr->base_address, ++ rmrr->end_address, 0); + } + + return domain_context_unmap(pdev->domain, devfn, pdev); +@@ -2289,7 +2212,8 @@ static void __hwdom_init setup_hwdom_rmr + * domain, there shouldn't be a conflict. So its always safe and + * strict to set 0. 
+ */ +- ret = rmrr_identity_mapping(d, 1, rmrr, 0); ++ ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, ++ rmrr->end_address, 0); + if ( ret ) + dprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: mapping reserved region failed\n"); +@@ -2460,7 +2384,9 @@ static int reassign_device_ownership( + * Any RMRR flag is always ignored when remove a device, + * but its always safe and strict to set 0. + */ +- ret = rmrr_identity_mapping(source, 0, rmrr, 0); ++ ret = iommu_identity_mapping(source, p2m_access_x, ++ rmrr->base_address, ++ rmrr->end_address, 0); + if ( ret != -ENOENT ) + return ret; + } +@@ -2556,7 +2482,8 @@ static int intel_iommu_assign_device( + PCI_BUS(bdf) == bus && + PCI_DEVFN2(bdf) == devfn ) + { +- ret = rmrr_identity_mapping(d, 1, rmrr, flag); ++ ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, ++ rmrr->end_address, flag); + if ( ret ) + { + int rc; +--- a/xen/drivers/passthrough/x86/iommu.c ++++ b/xen/drivers/passthrough/x86/iommu.c +@@ -143,6 +143,7 @@ int arch_iommu_domain_init(struct domain + + INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list); + spin_lock_init(&hd->arch.pgtables.lock); ++ INIT_LIST_HEAD(&hd->arch.identity_maps); + + return 0; + } +@@ -158,6 +159,99 @@ void arch_iommu_domain_destroy(struct do + page_list_empty(&dom_iommu(d)->arch.pgtables.list)); + } + ++struct identity_map { ++ struct list_head list; ++ paddr_t base, end; ++ p2m_access_t access; ++ unsigned int count; ++}; ++ ++int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, ++ paddr_t base, paddr_t end, ++ unsigned int flag) ++{ ++ unsigned long base_pfn = base >> PAGE_SHIFT_4K; ++ unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K; ++ struct identity_map *map; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ ASSERT(pcidevs_locked()); ++ ASSERT(base < end); ++ ++ /* ++ * No need to acquire hd->arch.mapping_lock: Both insertion and removal ++ * get done while holding pcidevs_lock. 
++ */ ++ list_for_each_entry( map, &hd->arch.identity_maps, list ) ++ { ++ if ( map->base == base && map->end == end ) ++ { ++ int ret = 0; ++ ++ if ( p2ma != p2m_access_x ) ++ { ++ if ( map->access != p2ma ) ++ return -EADDRINUSE; ++ ++map->count; ++ return 0; ++ } ++ ++ if ( --map->count ) ++ return 0; ++ ++ while ( base_pfn < end_pfn ) ++ { ++ if ( clear_identity_p2m_entry(d, base_pfn) ) ++ ret = -ENXIO; ++ base_pfn++; ++ } ++ ++ list_del(&map->list); ++ xfree(map); ++ ++ return ret; ++ } ++ ++ if ( end >= map->base && map->end >= base ) ++ return -EADDRINUSE; ++ } ++ ++ if ( p2ma == p2m_access_x ) ++ return -ENOENT; ++ ++ while ( base_pfn < end_pfn ) ++ { ++ int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); ++ ++ if ( err ) ++ return err; ++ base_pfn++; ++ } ++ ++ map = xmalloc(struct identity_map); ++ if ( !map ) ++ return -ENOMEM; ++ map->base = base; ++ map->end = end; ++ map->access = p2ma; ++ map->count = 1; ++ list_add_tail(&map->list, &hd->arch.identity_maps); ++ ++ return 0; ++} ++ ++void iommu_identity_map_teardown(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ struct identity_map *map, *tmp; ++ ++ list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list ) ++ { ++ list_del(&map->list); ++ xfree(map); ++ } ++} ++ + static bool __hwdom_init hwdom_iommu_map(const struct domain *d, + unsigned long pfn, + unsigned long max_pfn) +--- a/xen/include/asm-x86/iommu.h ++++ b/xen/include/asm-x86/iommu.h +@@ -16,6 +16,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -51,13 +52,14 @@ struct arch_iommu + spinlock_t lock; + } pgtables; + ++ struct list_head identity_maps; ++ + union { + /* Intel VT-d */ + struct { + uint64_t pgd_maddr; /* io page directory machine address */ + unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ + uint64_t iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ +- struct list_head mapped_rmrrs; + } vtd; + /* AMD IOMMU */ + struct { +@@ -123,6 +125,11 @@ static inline void iommu_disable_x2apic( + iommu_ops.disable_x2apic(); + } + ++int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, ++ paddr_t base, paddr_t end, ++ unsigned int flag); ++void iommu_identity_map_teardown(struct domain *d); ++ + extern bool untrusted_msi; + + int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq, +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -912,6 +912,34 @@ struct p2m_domain *p2m_get_altp2m(struct + static inline void p2m_altp2m_check(struct vcpu *v, uint16_t idx) {} + #endif + ++/* p2m access to IOMMU flags */ ++static inline unsigned int p2m_access_to_iommu_flags(p2m_access_t p2ma) ++{ ++ switch ( p2ma ) ++ { ++ case p2m_access_rw: ++ case p2m_access_rwx: ++ return IOMMUF_readable | IOMMUF_writable; ++ ++ case p2m_access_r: ++ case p2m_access_rx: ++ case p2m_access_rx2rw: ++ return IOMMUF_readable; ++ ++ case p2m_access_w: ++ case p2m_access_wx: ++ return IOMMUF_writable; ++ ++ case p2m_access_n: ++ case p2m_access_x: ++ case p2m_access_n2rwx: ++ return 0; ++ } ++ ++ ASSERT_UNREACHABLE(); ++ return 0; ++} ++ + /* + * p2m type to IOMMU flags + */ +@@ -933,9 +961,10 @@ static inline unsigned int p2m_get_iommu + flags = IOMMUF_readable; + break; + case p2m_mmio_direct: +- flags = IOMMUF_readable; +- if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) +- flags |= IOMMUF_writable; ++ flags = p2m_access_to_iommu_flags(p2ma); ++ if ( (flags & IOMMUF_writable) && ++ rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) 
++ flags &= ~IOMMUF_writable; + break; + default: + flags = 0; diff --git a/xsa378-4.15-5.patch b/xsa378-4.15-5.patch new file mode 100644 index 0000000..134f425 --- /dev/null +++ b/xsa378-4.15-5.patch @@ -0,0 +1,208 @@ +From: Jan Beulich +Subject: AMD/IOMMU: re-arrange/complete re-assignment handling + +Prior to the assignment step having completed successfully, devices +should not get associated with their new owner. Hand the device to DomIO +(perhaps temporarily), until after the de-assignment step has completed. + +De-assignment of a device (from other than Dom0) as well as failure of +reassign_device() during assignment should result in unity mappings +getting torn down. This in turn requires switching to a refcounted +mapping approach, as was already used by VT-d for its RMRRs, to prevent +unmapping a region used by multiple devices. + +This is CVE-2021-28696 / part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/drivers/passthrough/amd/iommu.h ++++ b/xen/drivers/passthrough/amd/iommu.h +@@ -232,8 +232,10 @@ int __must_check amd_iommu_unmap_page(st + unsigned int *flush_flags); + int __must_check amd_iommu_alloc_root(struct domain *d); + int amd_iommu_reserve_domain_unity_map(struct domain *domain, +- paddr_t phys_addr, unsigned long size, +- int iw, int ir); ++ const struct ivrs_unity_map *map, ++ unsigned int flag); ++int amd_iommu_reserve_domain_unity_unmap(struct domain *d, ++ const struct ivrs_unity_map *map); + int __must_check amd_iommu_flush_iotlb_pages(struct domain *d, dfn_t dfn, + unsigned long page_count, + unsigned int flush_flags); +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -419,38 +419,49 @@ int amd_iommu_flush_iotlb_all(struct dom + return 0; + } + +-int amd_iommu_reserve_domain_unity_map(struct domain *domain, +- paddr_t phys_addr, +- unsigned long size, int iw, int ir) ++int amd_iommu_reserve_domain_unity_map(struct domain *d, ++ const struct ivrs_unity_map *map, ++ unsigned int flag) + { +- unsigned long npages, i; +- unsigned long gfn; +- unsigned int flags = !!ir; +- unsigned int flush_flags = 0; +- int rt = 0; +- +- if ( iw ) +- flags |= IOMMUF_writable; +- +- npages = region_to_pages(phys_addr, size); +- gfn = phys_addr >> PAGE_SHIFT; +- for ( i = 0; i < npages; i++ ) ++ int rc; ++ ++ if ( d == dom_io ) ++ return 0; ++ ++ for ( rc = 0; !rc && map; map = map->next ) + { +- unsigned long frame = gfn + i; ++ p2m_access_t p2ma = p2m_access_n; ++ ++ if ( map->read ) ++ p2ma |= p2m_access_r; ++ if ( map->write ) ++ p2ma |= p2m_access_w; + +- rt = amd_iommu_map_page(domain, _dfn(frame), _mfn(frame), flags, +- &flush_flags); +- if ( rt != 0 ) +- break; ++ rc = iommu_identity_mapping(d, p2ma, map->addr, ++ map->addr + map->length - 1, flag); + } + +- /* Use while-break to avoid compiler warning */ +- while ( flush_flags && +- amd_iommu_flush_iotlb_pages(domain, _dfn(gfn), +- npages, flush_flags) ) +- break; ++ return rc; ++} ++ ++int amd_iommu_reserve_domain_unity_unmap(struct domain *d, ++ const struct ivrs_unity_map *map) ++{ ++ int rc; ++ ++ if ( d == dom_io ) ++ return 0; ++ ++ for ( rc = 0; map; map = map->next ) ++ { ++ int ret = iommu_identity_mapping(d, p2m_access_x, map->addr, ++ map->addr + map->length - 1, 0); ++ ++ if ( ret && ret != -ENOENT && !rc ) ++ rc = ret; ++ } + +- return rt; ++ return rc; + } + + int __init amd_iommu_quarantine_init(struct domain *d) +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -329,6 
+329,7 @@ static int reassign_device(struct domain + { + struct amd_iommu *iommu; + int bdf, rc; ++ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); + + bdf = PCI_BDF2(pdev->bus, pdev->devfn); + iommu = find_iommu_for_device(pdev->seg, bdf); +@@ -343,10 +344,24 @@ static int reassign_device(struct domain + + amd_iommu_disable_domain_device(source, iommu, devfn, pdev); + +- if ( devfn == pdev->devfn ) ++ /* ++ * If the device belongs to the hardware domain, and it has a unity mapping, ++ * don't remove it from the hardware domain, because BIOS may reference that ++ * mapping. ++ */ ++ if ( !is_hardware_domain(source) ) + { +- list_move(&pdev->domain_list, &target->pdev_list); +- pdev->domain = target; ++ rc = amd_iommu_reserve_domain_unity_unmap( ++ source, ++ ivrs_mappings[get_dma_requestor_id(pdev->seg, bdf)].unity_map); ++ if ( rc ) ++ return rc; ++ } ++ ++ if ( devfn == pdev->devfn && pdev->domain != dom_io ) ++ { ++ list_move(&pdev->domain_list, &dom_io->pdev_list); ++ pdev->domain = dom_io; + } + + rc = allocate_domain_resources(target); +@@ -357,6 +372,12 @@ static int reassign_device(struct domain + AMD_IOMMU_DEBUG("Re-assign %pp from dom%d to dom%d\n", + &pdev->sbdf, source->domain_id, target->domain_id); + ++ if ( devfn == pdev->devfn && pdev->domain != target ) ++ { ++ list_move(&pdev->domain_list, &target->pdev_list); ++ pdev->domain = target; ++ } ++ + return 0; + } + +@@ -367,20 +388,28 @@ static int amd_iommu_assign_device(struc + struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); + int bdf = PCI_BDF2(pdev->bus, devfn); + int req_id = get_dma_requestor_id(pdev->seg, bdf); +- const struct ivrs_unity_map *unity_map; ++ int rc = amd_iommu_reserve_domain_unity_map( ++ d, ivrs_mappings[req_id].unity_map, flag); + +- for ( unity_map = ivrs_mappings[req_id].unity_map; unity_map; +- unity_map = unity_map->next ) ++ if ( !rc ) ++ rc = reassign_device(pdev->domain, d, devfn, pdev); ++ ++ if ( rc && !is_hardware_domain(d) ) + { +- int rc = amd_iommu_reserve_domain_unity_map( +- d, unity_map->addr, unity_map->length, +- unity_map->write, unity_map->read); ++ int ret = amd_iommu_reserve_domain_unity_unmap( ++ d, ivrs_mappings[req_id].unity_map); + +- if ( rc ) +- return rc; ++ if ( ret ) ++ { ++ printk(XENLOG_ERR "AMD-Vi: " ++ "unity-unmap for %pd/%04x:%02x:%02x.%u failed (%d)\n", ++ d, pdev->seg, pdev->bus, ++ PCI_SLOT(devfn), PCI_FUNC(devfn), ret); ++ domain_crash(d); ++ } + } + +- return reassign_device(pdev->domain, d, devfn, pdev); ++ return rc; + } + + static void amd_iommu_clear_root_pgtable(struct domain *d) +@@ -394,6 +423,7 @@ static void amd_iommu_clear_root_pgtable + + static void amd_iommu_domain_destroy(struct domain *d) + { ++ iommu_identity_map_teardown(d); + ASSERT(!dom_iommu(d)->arch.amd.root_table); + } + diff --git a/xsa378-4.15-6.patch b/xsa378-4.15-6.patch new file mode 100644 index 0000000..d6ebc2a --- /dev/null +++ b/xsa378-4.15-6.patch @@ -0,0 +1,411 @@ +From: Jan Beulich +Subject: AMD/IOMMU: re-arrange exclusion range and unity map recording + +The spec makes no provisions for OS behavior here to depend on the +amount of RAM found on the system. While the spec may not sufficiently +clearly distinguish both kinds of regions, they are surely meant to be +separate things: Only regions with ACPI_IVMD_EXCLUSION_RANGE set should +be candidates for putting in the exclusion range registers. 
(As there's +only a single such pair of registers per IOMMU, secondary non-adjacent +regions with the flag set already get converted to unity mapped +regions.) + +First of all, drop the dependency on max_page. With commit b4f042236ae0 +("AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables") the +use of it here was stale anyway; it was bogus already before, as it +didn't account for max_page getting increased later on. Simply try an +exclusion range registration first, and if it fails (for being +unsuitable or non-mergeable), register a unity mapping range. + +With this various local variables become unnecessary and hence get +dropped at the same time. + +With the max_page boundary dropped for using unity maps, the minimum +page table tree height now needs both recording and enforcing in +amd_iommu_domain_init(). Since we can't predict which devices may get +assigned to a domain, our only option is to uniformly force at least +that height for all domains, now that the height isn't dynamic anymore. + +Further don't make use of the exclusion range unless ACPI data says so. + +Note that exclusion range registration in +register_range_for_all_devices() is on a best effort basis. Hence unity +map entries also registered are redundant when the former succeeded, but +they also do no harm. Improvements in this area can be done later imo. + +Also adjust types where suitable without touching extra lines. + +This is part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/drivers/passthrough/amd/iommu.h ++++ b/xen/drivers/passthrough/amd/iommu.h +@@ -304,6 +304,8 @@ extern struct hpet_sbdf { + } init; + } hpet_sbdf; + ++extern int amd_iommu_min_paging_mode; ++ + extern void *shared_intremap_table; + extern unsigned long *shared_intremap_inuse; + +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c +@@ -117,12 +117,8 @@ static struct amd_iommu * __init find_io + } + + static int __init reserve_iommu_exclusion_range( +- struct amd_iommu *iommu, uint64_t base, uint64_t limit, +- bool all, bool iw, bool ir) ++ struct amd_iommu *iommu, paddr_t base, paddr_t limit, bool all) + { +- if ( !ir || !iw ) +- return -EPERM; +- + /* need to extend exclusion range? */ + if ( iommu->exclusion_enable ) + { +@@ -151,14 +147,18 @@ static int __init reserve_unity_map_for_ + { + struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); + struct ivrs_unity_map *unity_map = ivrs_mappings[bdf].unity_map; ++ int paging_mode = amd_iommu_get_paging_mode(PFN_UP(base + length)); ++ ++ if ( paging_mode < 0 ) ++ return paging_mode; + + /* Check for overlaps. */ + for ( ; unity_map; unity_map = unity_map->next ) + { + /* + * Exact matches are okay. This can in particular happen when +- * register_exclusion_range_for_device() calls here twice for the +- * same (s,b,d,f). ++ * register_range_for_device() calls here twice for the same ++ * (s,b,d,f). 
+ */ + if ( base == unity_map->addr && length == unity_map->length && + ir == unity_map->read && iw == unity_map->write ) +@@ -186,55 +186,52 @@ static int __init reserve_unity_map_for_ + unity_map->next = ivrs_mappings[bdf].unity_map; + ivrs_mappings[bdf].unity_map = unity_map; + ++ if ( paging_mode > amd_iommu_min_paging_mode ) ++ amd_iommu_min_paging_mode = paging_mode; ++ + return 0; + } + +-static int __init register_exclusion_range_for_all_devices( +- unsigned long base, unsigned long limit, u8 iw, u8 ir) ++static int __init register_range_for_all_devices( ++ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) + { + int seg = 0; /* XXX */ +- unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; +- unsigned int bdf; + int rc = 0; + + /* is part of exclusion range inside of IOMMU virtual address space? */ + /* note: 'limit' parameter is assumed to be page-aligned */ +- range_top = limit + PAGE_SIZE; +- iommu_top = max_page * PAGE_SIZE; +- if ( base < iommu_top ) +- { +- if ( range_top > iommu_top ) +- range_top = iommu_top; +- length = range_top - base; +- /* reserve r/w unity-mapped page entries for devices */ +- /* note: these entries are part of the exclusion range */ +- for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) +- rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); +- /* push 'base' just outside of virtual address space */ +- base = iommu_top; +- } +- /* register IOMMU exclusion range settings */ +- if ( !rc && limit >= iommu_top ) ++ if ( exclusion ) + { + for_each_amd_iommu( iommu ) + { +- rc = reserve_iommu_exclusion_range(iommu, base, limit, +- true /* all */, iw, ir); +- if ( rc ) +- break; ++ int ret = reserve_iommu_exclusion_range(iommu, base, limit, ++ true /* all */); ++ ++ if ( ret && !rc ) ++ rc = ret; + } + } + ++ if ( !exclusion || rc ) ++ { ++ paddr_t length = limit + PAGE_SIZE - base; ++ unsigned int bdf; ++ ++ /* reserve r/w unity-mapped page entries for devices */ ++ for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) ++ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); ++ } ++ + return rc; + } + +-static int __init register_exclusion_range_for_device( +- u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir) ++static int __init register_range_for_device( ++ unsigned int bdf, paddr_t base, paddr_t limit, ++ bool iw, bool ir, bool exclusion) + { + int seg = 0; /* XXX */ + struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); +- unsigned long range_top, iommu_top, length; + struct amd_iommu *iommu; + u16 req; + int rc = 0; +@@ -248,27 +245,19 @@ static int __init register_exclusion_ran + req = ivrs_mappings[bdf].dte_requestor_id; + + /* note: 'limit' parameter is assumed to be page-aligned */ +- range_top = limit + PAGE_SIZE; +- iommu_top = max_page * PAGE_SIZE; +- if ( base < iommu_top ) +- { +- if ( range_top > iommu_top ) +- range_top = iommu_top; +- length = range_top - base; ++ if ( exclusion ) ++ rc = reserve_iommu_exclusion_range(iommu, base, limit, ++ false /* all */); ++ if ( !exclusion || rc ) ++ { ++ paddr_t length = limit + PAGE_SIZE - base; ++ + /* reserve unity-mapped page entries for device */ +- /* note: these entries are part of the exclusion range */ + rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir) ?: + reserve_unity_map_for_device(seg, req, base, length, iw, ir); +- +- /* push 'base' just outside of virtual address space */ +- base = iommu_top; + } +- +- /* register IOMMU exclusion range settings for device */ +- if ( !rc && limit >= 
iommu_top ) ++ else + { +- rc = reserve_iommu_exclusion_range(iommu, base, limit, +- false /* all */, iw, ir); + ivrs_mappings[bdf].dte_allow_exclusion = true; + ivrs_mappings[req].dte_allow_exclusion = true; + } +@@ -276,53 +265,42 @@ static int __init register_exclusion_ran + return rc; + } + +-static int __init register_exclusion_range_for_iommu_devices( +- struct amd_iommu *iommu, +- unsigned long base, unsigned long limit, u8 iw, u8 ir) ++static int __init register_range_for_iommu_devices( ++ struct amd_iommu *iommu, paddr_t base, paddr_t limit, ++ bool iw, bool ir, bool exclusion) + { +- unsigned long range_top, iommu_top, length; ++ /* note: 'limit' parameter is assumed to be page-aligned */ ++ paddr_t length = limit + PAGE_SIZE - base; + unsigned int bdf; + u16 req; +- int rc = 0; ++ int rc; + +- /* is part of exclusion range inside of IOMMU virtual address space? */ +- /* note: 'limit' parameter is assumed to be page-aligned */ +- range_top = limit + PAGE_SIZE; +- iommu_top = max_page * PAGE_SIZE; +- if ( base < iommu_top ) +- { +- if ( range_top > iommu_top ) +- range_top = iommu_top; +- length = range_top - base; +- /* reserve r/w unity-mapped page entries for devices */ +- /* note: these entries are part of the exclusion range */ +- for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) +- { +- if ( iommu == find_iommu_for_device(iommu->seg, bdf) ) +- { +- req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; +- rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, +- iw, ir) ?: +- reserve_unity_map_for_device(iommu->seg, req, base, length, +- iw, ir); +- } +- } +- +- /* push 'base' just outside of virtual address space */ +- base = iommu_top; ++ if ( exclusion ) ++ { ++ rc = reserve_iommu_exclusion_range(iommu, base, limit, true /* all */); ++ if ( !rc ) ++ return 0; + } + +- /* register IOMMU exclusion range settings */ +- if ( !rc && limit >= iommu_top ) +- rc = reserve_iommu_exclusion_range(iommu, base, limit, +- true /* all */, iw, ir); ++ /* reserve unity-mapped page entries for devices */ ++ for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) ++ { ++ if ( iommu != find_iommu_for_device(iommu->seg, bdf) ) ++ continue; ++ ++ req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; ++ rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, ++ iw, ir) ?: ++ reserve_unity_map_for_device(iommu->seg, req, base, length, ++ iw, ir); ++ } + + return rc; + } + + static int __init parse_ivmd_device_select( + const struct acpi_ivrs_memory *ivmd_block, +- unsigned long base, unsigned long limit, u8 iw, u8 ir) ++ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) + { + u16 bdf; + +@@ -333,12 +311,12 @@ static int __init parse_ivmd_device_sele + return -ENODEV; + } + +- return register_exclusion_range_for_device(bdf, base, limit, iw, ir); ++ return register_range_for_device(bdf, base, limit, iw, ir, exclusion); + } + + static int __init parse_ivmd_device_range( + const struct acpi_ivrs_memory *ivmd_block, +- unsigned long base, unsigned long limit, u8 iw, u8 ir) ++ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) + { + unsigned int first_bdf, last_bdf, bdf; + int error; +@@ -360,15 +338,15 @@ static int __init parse_ivmd_device_rang + } + + for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ ) +- error = register_exclusion_range_for_device( +- bdf, base, limit, iw, ir); ++ error = register_range_for_device( ++ bdf, base, limit, iw, ir, exclusion); + + return error; + } + + static int __init 
parse_ivmd_device_iommu( + const struct acpi_ivrs_memory *ivmd_block, +- unsigned long base, unsigned long limit, u8 iw, u8 ir) ++ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) + { + int seg = 0; /* XXX */ + struct amd_iommu *iommu; +@@ -383,14 +361,14 @@ static int __init parse_ivmd_device_iomm + return -ENODEV; + } + +- return register_exclusion_range_for_iommu_devices( +- iommu, base, limit, iw, ir); ++ return register_range_for_iommu_devices( ++ iommu, base, limit, iw, ir, exclusion); + } + + static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) + { + unsigned long start_addr, mem_length, base, limit; +- u8 iw, ir; ++ bool iw = true, ir = true, exclusion = false; + + if ( ivmd_block->header.length < sizeof(*ivmd_block) ) + { +@@ -407,13 +385,11 @@ static int __init parse_ivmd_block(const + ivmd_block->header.type, start_addr, mem_length); + + if ( ivmd_block->header.flags & ACPI_IVMD_EXCLUSION_RANGE ) +- iw = ir = IOMMU_CONTROL_ENABLED; ++ exclusion = true; + else if ( ivmd_block->header.flags & ACPI_IVMD_UNITY ) + { +- iw = ivmd_block->header.flags & ACPI_IVMD_READ ? +- IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; +- ir = ivmd_block->header.flags & ACPI_IVMD_WRITE ? +- IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; ++ iw = ivmd_block->header.flags & ACPI_IVMD_READ; ++ ir = ivmd_block->header.flags & ACPI_IVMD_WRITE; + } + else + { +@@ -424,20 +400,20 @@ static int __init parse_ivmd_block(const + switch( ivmd_block->header.type ) + { + case ACPI_IVRS_TYPE_MEMORY_ALL: +- return register_exclusion_range_for_all_devices( +- base, limit, iw, ir); ++ return register_range_for_all_devices( ++ base, limit, iw, ir, exclusion); + + case ACPI_IVRS_TYPE_MEMORY_ONE: +- return parse_ivmd_device_select(ivmd_block, +- base, limit, iw, ir); ++ return parse_ivmd_device_select(ivmd_block, base, limit, ++ iw, ir, exclusion); + + case ACPI_IVRS_TYPE_MEMORY_RANGE: +- return parse_ivmd_device_range(ivmd_block, +- base, limit, iw, ir); ++ return parse_ivmd_device_range(ivmd_block, base, limit, ++ iw, ir, exclusion); + + case ACPI_IVRS_TYPE_MEMORY_IOMMU: +- return parse_ivmd_device_iommu(ivmd_block, +- base, limit, iw, ir); ++ return parse_ivmd_device_iommu(ivmd_block, base, limit, ++ iw, ir, exclusion); + + default: + AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n"); +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -234,6 +234,8 @@ static int __must_check allocate_domain_ + return rc; + } + ++int __read_mostly amd_iommu_min_paging_mode = 1; ++ + static int amd_iommu_domain_init(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); +@@ -245,11 +247,13 @@ static int amd_iommu_domain_init(struct + * - HVM could in principle use 3 or 4 depending on how much guest + * physical address space we give it, but this isn't known yet so use 4 + * unilaterally. ++ * - Unity maps may require an even higher number. + */ +- hd->arch.amd.paging_mode = amd_iommu_get_paging_mode( +- is_hvm_domain(d) +- ? 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT) +- : get_upper_mfn_bound() + 1); ++ hd->arch.amd.paging_mode = max(amd_iommu_get_paging_mode( ++ is_hvm_domain(d) ++ ? 
1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT) ++ : get_upper_mfn_bound() + 1), ++ amd_iommu_min_paging_mode); + + return 0; + } diff --git a/xsa378-4.15-7.patch b/xsa378-4.15-7.patch new file mode 100644 index 0000000..0f59532 --- /dev/null +++ b/xsa378-4.15-7.patch @@ -0,0 +1,88 @@ +From: Jan Beulich +Subject: x86/p2m: introduce p2m_is_special() + +Seeing the similarity of grant, foreign, and (subsequently) direct-MMIO +handling, introduce a new P2M type group named "special" (as in "needing +special accessors to create/destroy"). + +Also use -EPERM instead of other error codes on the two domain_crash() +paths touched. + +This is part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -811,7 +811,7 @@ p2m_remove_page(struct p2m_domain *p2m, + for ( i = 0; i < (1UL << page_order); i++ ) + { + p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL); +- if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) ) ++ if ( !p2m_is_special(t) && !p2m_is_shared(t) ) + set_gpfn_from_mfn(mfn_x(mfn) + i, INVALID_M2P_ENTRY); + } + } +@@ -941,13 +941,13 @@ guest_physmap_add_entry(struct domain *d + &ot, &a, 0, NULL, NULL); + ASSERT(!p2m_is_shared(ot)); + } +- if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) ++ if ( p2m_is_special(ot) ) + { +- /* Really shouldn't be unmapping grant/foreign maps this way */ ++ /* Don't permit unmapping grant/foreign this way. */ + domain_crash(d); + p2m_unlock(p2m); + +- return -EINVAL; ++ return -EPERM; + } + else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) + { +@@ -1041,8 +1041,7 @@ int p2m_change_type_one(struct domain *d + struct p2m_domain *p2m = p2m_get_hostp2m(d); + int rc; + +- BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); +- BUG_ON(p2m_is_foreign(ot) || p2m_is_foreign(nt)); ++ BUG_ON(p2m_is_special(ot) || p2m_is_special(nt)); + + gfn_lock(p2m, gfn, 0); + +@@ -1289,11 +1288,11 @@ static int set_typed_p2m_entry(struct do + gfn_unlock(p2m, gfn, order); + return cur_order + 1; + } +- if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) ++ if ( p2m_is_special(ot) ) + { + gfn_unlock(p2m, gfn, order); + domain_crash(d); +- return -ENOENT; ++ return -EPERM; + } + else if ( p2m_is_ram(ot) ) + { +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -149,6 +149,10 @@ typedef unsigned int p2m_query_t; + | p2m_to_mask(p2m_ram_logdirty) ) + #define P2M_SHARED_TYPES (p2m_to_mask(p2m_ram_shared)) + ++/* Types established/cleaned up via special accessors. */ ++#define P2M_SPECIAL_TYPES (P2M_GRANT_TYPES | \ ++ p2m_to_mask(p2m_map_foreign)) ++ + /* Valid types not necessarily associated with a (valid) MFN. 
*/ + #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES \ + | p2m_to_mask(p2m_mmio_direct) \ +@@ -177,6 +181,7 @@ typedef unsigned int p2m_query_t; + #define p2m_is_paged(_t) (p2m_to_mask(_t) & P2M_PAGED_TYPES) + #define p2m_is_sharable(_t) (p2m_to_mask(_t) & P2M_SHARABLE_TYPES) + #define p2m_is_shared(_t) (p2m_to_mask(_t) & P2M_SHARED_TYPES) ++#define p2m_is_special(_t) (p2m_to_mask(_t) & P2M_SPECIAL_TYPES) + #define p2m_is_broken(_t) (p2m_to_mask(_t) & P2M_BROKEN_TYPES) + #define p2m_is_foreign(_t) (p2m_to_mask(_t) & p2m_to_mask(p2m_map_foreign)) + diff --git a/xsa378-4.15-8.patch b/xsa378-4.15-8.patch new file mode 100644 index 0000000..391a0bc --- /dev/null +++ b/xsa378-4.15-8.patch @@ -0,0 +1,155 @@ +From: Jan Beulich +Subject: x86/p2m: guard (in particular) identity mapping entries + +Such entries, created by set_identity_p2m_entry(), should only be +destroyed by clear_identity_p2m_entry(). However, similarly, entries +created by set_mmio_p2m_entry() should only be torn down by +clear_mmio_p2m_entry(), so the logic gets based upon p2m_mmio_direct as +the entry type (separation between "ordinary" and 1:1 mappings would +require a further indicator to tell apart the two). + +As to the guest_remove_page() change, commit 48dfb297a20a ("x86/PVH: +allow guest_remove_page to remove p2m_mmio_direct pages"), which +introduced the call to clear_mmio_p2m_entry(), claimed this was done for +hwdom only without this actually having been the case. However, this +code shouldn't be there in the first place, as MMIO entries shouldn't be +dropped this way. Avoid triggering the warning again that 48dfb297a20a +silenced by an adjustment to xenmem_add_to_physmap_one() instead. + +Note that guest_physmap_mark_populate_on_demand() gets tightened beyond +the immediate purpose of this change. + +Note also that I didn't inspect code which isn't security supported, +e.g. sharing, paging, or altp2m. + +This is CVE-2021-28694 / part of XSA-378. + +Signed-off-by: Jan Beulich +Reviewed-by: Paul Durrant + +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -799,7 +799,8 @@ p2m_remove_page(struct p2m_domain *p2m, + &cur_order, NULL); + + if ( p2m_is_valid(t) && +- (!mfn_valid(mfn) || !mfn_eq(mfn_add(mfn, i), mfn_return)) ) ++ (!mfn_valid(mfn) || t == p2m_mmio_direct || ++ !mfn_eq(mfn_add(mfn, i), mfn_return)) ) + return -EILSEQ; + + i += (1UL << cur_order) - +@@ -899,7 +900,7 @@ guest_physmap_add_entry(struct domain *d + if ( p2m_is_foreign(t) ) + return -EINVAL; + +- if ( !mfn_valid(mfn) ) ++ if ( !mfn_valid(mfn) || t == p2m_mmio_direct ) + { + ASSERT_UNREACHABLE(); + return -EINVAL; +@@ -943,7 +944,7 @@ guest_physmap_add_entry(struct domain *d + } + if ( p2m_is_special(ot) ) + { +- /* Don't permit unmapping grant/foreign this way. */ ++ /* Don't permit unmapping grant/foreign/direct-MMIO this way. */ + domain_crash(d); + p2m_unlock(p2m); + +@@ -1399,8 +1400,8 @@ int set_identity_p2m_entry(struct domain + * order+1 for caller to retry with order (guaranteed smaller than + * the order value passed in) + */ +-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn_l, mfn_t mfn, +- unsigned int order) ++static int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn_l, ++ mfn_t mfn, unsigned int order) + { + int rc = -EINVAL; + gfn_t gfn = _gfn(gfn_l); +@@ -2731,7 +2732,9 @@ int xenmem_add_to_physmap_one( + + /* Remove previously mapped page if it was present. 
*/ + prev_mfn = get_gfn(d, gfn_x(gpfn), &p2mt); +- if ( mfn_valid(prev_mfn) ) ++ if ( p2mt == p2m_mmio_direct ) ++ rc = -EPERM; ++ else if ( mfn_valid(prev_mfn) ) + { + if ( is_special_page(mfn_to_page(prev_mfn)) ) + /* Special pages are simply unhooked from this phys slot. */ +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -1299,17 +1299,17 @@ guest_physmap_mark_populate_on_demand(st + + p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL); + n = 1UL << min(order, cur_order); +- if ( p2m_is_ram(ot) ) ++ if ( ot == p2m_populate_on_demand ) ++ { ++ /* Count how many PoD entries we'll be replacing if successful */ ++ pod_count += n; ++ } ++ else if ( ot != p2m_invalid && ot != p2m_mmio_dm ) + { + P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); + rc = -EBUSY; + goto out; + } +- else if ( ot == p2m_populate_on_demand ) +- { +- /* Count how man PoD entries we'll be replacing if successful */ +- pod_count += n; +- } + } + + /* Now, actually do the two-way mapping */ +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -330,7 +330,7 @@ int guest_remove_page(struct domain *d, + } + if ( p2mt == p2m_mmio_direct ) + { +- rc = clear_mmio_p2m_entry(d, gmfn, mfn, PAGE_ORDER_4K); ++ rc = -EPERM; + goto out_put_gfn; + } + #else +@@ -1875,6 +1875,15 @@ int check_get_page_from_gfn(struct domai + return -EAGAIN; + } + #endif ++#ifdef CONFIG_X86 ++ if ( p2mt == p2m_mmio_direct ) ++ { ++ if ( page ) ++ put_page(page); ++ ++ return -EPERM; ++ } ++#endif + + if ( !page ) + return -EINVAL; +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -151,7 +151,8 @@ typedef unsigned int p2m_query_t; + + /* Types established/cleaned up via special accessors. */ + #define P2M_SPECIAL_TYPES (P2M_GRANT_TYPES | \ +- p2m_to_mask(p2m_map_foreign)) ++ p2m_to_mask(p2m_map_foreign) | \ ++ p2m_to_mask(p2m_mmio_direct)) + + /* Valid types not necessarily associated with a (valid) MFN. */ + #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES \ +@@ -666,8 +667,6 @@ int p2m_is_logdirty_range(struct p2m_dom + /* Set mmio addresses in the p2m table (for pass-through) */ + int set_mmio_p2m_entry(struct domain *d, gfn_t gfn, mfn_t mfn, + unsigned int order); +-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, +- unsigned int order); + + /* Set identity addresses in the p2m table (for pass-through) */ + int set_identity_p2m_entry(struct domain *d, unsigned long gfn, diff --git a/xsa379-4.15.patch b/xsa379-4.15.patch new file mode 100644 index 0000000..004f56e --- /dev/null +++ b/xsa379-4.15.patch @@ -0,0 +1,82 @@ +From: Jan Beulich +Subject: x86/mm: widen locked region in xenmem_add_to_physmap_one() + +For pages which can be made part of the P2M by the guest, but which can +also later be de-allocated (grant table v2 status pages being the +present example), it is imperative that they be mapped at no more than a +single GFN. We therefore need to make sure that of two parallel +XENMAPSPACE_grant_table requests for the same status page one completes +before the second checks at which other GFN the underlying MFN is +presently mapped. + +Push down the respective put_gfn(). This leverages that gfn_lock() +really aliases p2m_lock(), but the function makes this assumption +already anyway: In the XENMAPSPACE_gmfn case lock nesting constraints +for both involved GFNs would otherwise need to be enforced to avoid ABBA +deadlocks. + +This is CVE-2021-28697 / XSA-379. 
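The single-GFN guarantee argued for above can be illustrated with a stand-alone toy model. This is not Xen code: toy_remap(), the two arrays and their names are invented for the example, and a plain pthread mutex stands in for the P2M lock; it only shows why the lookup of an MFN's current GFN and the installation of its new GFN need to sit inside one critical section.

/* Toy sketch only -- not part of the patch. */
#include <pthread.h>
#include <stdio.h>

#define NR_PAGES 16

static int gfn_of_mfn[NR_PAGES];   /* toy M2P: -1 == not mapped anywhere */
static int mfn_at_gfn[NR_PAGES];   /* toy P2M: -1 == nothing mapped here */
static pthread_mutex_t p2m_lock = PTHREAD_MUTEX_INITIALIZER;

/* Map @mfn at @gfn, dropping whatever GFN it was mapped at before. */
static void toy_remap(int mfn, int gfn)
{
    pthread_mutex_lock(&p2m_lock);      /* one lock around lookup *and* update */

    int old_gfn = gfn_of_mfn[mfn];
    if (old_gfn >= 0)
        mfn_at_gfn[old_gfn] = -1;       /* unmap from old location, if any */

    mfn_at_gfn[gfn] = mfn;              /* map at new location */
    gfn_of_mfn[mfn] = gfn;

    /*
     * If the lock were dropped between reading old_gfn and updating the
     * tables (the effect of the early put_gfn() the patch removes), two
     * racing toy_remap() calls could both read the same old_gfn and leave
     * the MFN reachable through two GFNs at once.
     */
    pthread_mutex_unlock(&p2m_lock);
}

int main(void)
{
    for (int i = 0; i < NR_PAGES; i++)
        gfn_of_mfn[i] = mfn_at_gfn[i] = -1;

    toy_remap(3, 5);
    toy_remap(3, 7);                    /* the stale mapping at GFN 5 is gone */
    printf("mfn 3 is mapped at gfn %d, gfn 5 holds mfn %d\n",
           gfn_of_mfn[3], mfn_at_gfn[5]);
    return 0;
}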
+ +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall +--- +Since there was some re-ordering of the checks in staging/master (the +-EXDEV now sitting earlier there), I deemed it better to drop the +earlier "if ( rc )" and allow an earlier error to be overwritten by +-EXDEV here. + +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -2730,8 +2730,20 @@ int xenmem_add_to_physmap_one( + goto put_both; + } + +- /* Remove previously mapped page if it was present. */ ++ /* ++ * Note that we're (ab)using GFN locking (to really be locking of the ++ * entire P2M) here in (at least) two ways: Finer grained locking would ++ * expose lock order violations in the XENMAPSPACE_gmfn case (due to the ++ * earlier get_gfn_unshare() above). Plus at the very least for the grant ++ * table v2 status page case we need to guarantee that the same page can ++ * only appear at a single GFN. While this is a property we want in ++ * general, for pages which can subsequently be freed this imperative: ++ * Upon freeing we wouldn't be able to find other mappings in the P2M ++ * (unless we did a brute force search). ++ */ + prev_mfn = get_gfn(d, gfn_x(gpfn), &p2mt); ++ ++ /* Remove previously mapped page if it was present. */ + if ( p2mt == p2m_mmio_direct ) + rc = -EPERM; + else if ( mfn_valid(prev_mfn) ) +@@ -2743,27 +2755,21 @@ int xenmem_add_to_physmap_one( + /* Normal domain memory is freed, to avoid leaking memory. */ + rc = guest_remove_page(d, gfn_x(gpfn)); + } +- /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */ +- put_gfn(d, gfn_x(gpfn)); +- +- if ( rc ) +- goto put_both; + + /* Unmap from old location, if any. */ + old_gpfn = get_gpfn_from_mfn(mfn_x(mfn)); + ASSERT(!SHARED_M2P(old_gpfn)); + if ( space == XENMAPSPACE_gmfn && old_gpfn != gfn ) +- { + rc = -EXDEV; +- goto put_both; +- } +- if ( old_gpfn != INVALID_M2P_ENTRY ) ++ else if ( !rc && old_gpfn != INVALID_M2P_ENTRY ) + rc = guest_physmap_remove_page(d, _gfn(old_gpfn), mfn, PAGE_ORDER_4K); + + /* Map at new location. */ + if ( !rc ) + rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); + ++ put_gfn(d, gfn_x(gpfn)); ++ + put_both: + /* + * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. diff --git a/xsa380-1.patch b/xsa380-1.patch new file mode 100644 index 0000000..9212d73 --- /dev/null +++ b/xsa380-1.patch @@ -0,0 +1,178 @@ +From: Jan Beulich +Subject: gnttab: add preemption check to gnttab_release_mappings() + +A guest may die with many grant mappings still in place, or simply with +a large maptrack table. Iterating through this may take more time than +is reasonable without intermediate preemption (to run softirqs and +perhaps the scheduler). + +Move the invocation of the function to the section where other +restartable functions get invoked, and have the function itself check +for preemption every once in a while. Have it iterate the table +backwards, such that decreasing the maptrack limit is all it takes to +convey restart information. + +In domain_teardown() introduce PROG_none such that inserting at the +front will be easier going forward. + +This is part of CVE-2021-28698 / XSA-380. + +Reported-by: Andrew Cooper +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall +--- +While I consider removal of the freeing of t->maptrack[i] from +grant_table_destroy() an integral part of this change, also freeing +t->maptrack right in gnttab_release_mappings() would seem like an +unrelated change to me, so I'm not moving that one for now. 
If others +think it would better be moved, I'd be happy to do so. + +While in principle it would be nice to also eliminate the other loops +from grant_table_destroy() (which can all take long as well provided a +large enough max_grant_frames), ->maptrack[] really is special in that +it only gets accessed when processing requests by the domain itself. The +other arrays may all continue to be accessed as remote domains drop uses +of grants by the dying domain. +--- +v3: Add comment. +v2: Move gnttab_release_mappings() invocation into domain_teardown(). + Don't crash when cleaning up domain without maptrack table. Extend + comment next to maptrack_limit. + +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -412,11 +412,18 @@ static int domain_teardown(struct domain + v = d->teardown.vcpu + + enum { +- PROG_vcpu_teardown = 1, ++ PROG_none, ++ PROG_gnttab_mappings, ++ PROG_vcpu_teardown, + PROG_done, + }; + +- case 0: ++ case PROG_none: ++ rc = gnttab_release_mappings(d); ++ if ( rc ) ++ return rc; ++ ++ PROGRESS(gnttab_mappings): + for_each_vcpu ( d, v ) + { + PROGRESS_VCPU(teardown); +@@ -908,7 +915,6 @@ int domain_kill(struct domain *d) + return domain_kill(d); + d->is_dying = DOMDYING_dying; + argo_destroy(d); +- gnttab_release_mappings(d); + vnuma_destroy(d->vnuma); + domain_set_outstanding_pages(d, 0); + /* fallthrough */ +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -64,7 +64,13 @@ struct grant_table { + unsigned int nr_grant_frames; + /* Number of grant status frames shared with guest (for version 2) */ + unsigned int nr_status_frames; +- /* Number of available maptrack entries. */ ++ /* ++ * Number of available maptrack entries. For cleanup purposes it is ++ * important to realize that this field and @maptrack further down will ++ * only ever be accessed by the local domain. Thus it is okay to clean ++ * up early, and to shrink the limit for the purpose of tracking cleanup ++ * progress. ++ */ + unsigned int maptrack_limit; + /* Shared grant table (see include/public/grant_table.h). */ + union { +@@ -3679,9 +3685,7 @@ do_grant_table_op( + #include "compat/grant_table.c" + #endif + +-void +-gnttab_release_mappings( +- struct domain *d) ++int gnttab_release_mappings(struct domain *d) + { + struct grant_table *gt = d->grant_table, *rgt; + struct grant_mapping *map; +@@ -3695,8 +3699,32 @@ gnttab_release_mappings( + + BUG_ON(!d->is_dying); + +- for ( handle = 0; handle < gt->maptrack_limit; handle++ ) ++ if ( !gt || !gt->maptrack ) ++ return 0; ++ ++ for ( handle = gt->maptrack_limit; handle; ) + { ++ /* ++ * Deal with full pages such that their freeing (in the body of the ++ * if()) remains simple. ++ */ ++ if ( handle < gt->maptrack_limit && !(handle % MAPTRACK_PER_PAGE) ) ++ { ++ /* ++ * Changing maptrack_limit alters nr_maptrack_frames()'es return ++ * value. Free the then excess trailing page right here, rather ++ * than leaving it to grant_table_destroy() (and in turn requiring ++ * to leave gt->maptrack_limit unaltered). 
++ */ ++ gt->maptrack_limit = handle; ++ FREE_XENHEAP_PAGE(gt->maptrack[nr_maptrack_frames(gt)]); ++ ++ if ( hypercall_preempt_check() ) ++ return -ERESTART; ++ } ++ ++ --handle; ++ + map = &maptrack_entry(gt, handle); + if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ) + continue; +@@ -3780,6 +3808,11 @@ gnttab_release_mappings( + + map->flags = 0; + } ++ ++ gt->maptrack_limit = 0; ++ FREE_XENHEAP_PAGE(gt->maptrack[0]); ++ ++ return 0; + } + + void grant_table_warn_active_grants(struct domain *d) +@@ -3843,8 +3876,7 @@ grant_table_destroy( + free_xenheap_page(t->shared_raw[i]); + xfree(t->shared_raw); + +- for ( i = 0; i < nr_maptrack_frames(t); i++ ) +- free_xenheap_page(t->maptrack[i]); ++ ASSERT(!t->maptrack_limit); + vfree(t->maptrack); + + for ( i = 0; i < nr_active_grant_frames(t); i++ ) +--- a/xen/include/xen/grant_table.h ++++ b/xen/include/xen/grant_table.h +@@ -47,9 +47,7 @@ void grant_table_init_vcpu(struct vcpu * + void grant_table_warn_active_grants(struct domain *d); + + /* Domain death release of granted mappings of other domains' memory. */ +-void +-gnttab_release_mappings( +- struct domain *d); ++int gnttab_release_mappings(struct domain *d); + + int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref, + gfn_t *gfn, uint16_t *status); +@@ -80,7 +78,7 @@ static inline void grant_table_init_vcpu + + static inline void grant_table_warn_active_grants(struct domain *d) {} + +-static inline void gnttab_release_mappings(struct domain *d) {} ++static inline int gnttab_release_mappings(struct domain *d) { return 0; } + + static inline int mem_sharing_gref_to_gfn(struct grant_table *gt, + grant_ref_t ref, diff --git a/xsa380-2.patch b/xsa380-2.patch new file mode 100644 index 0000000..e5a134d --- /dev/null +++ b/xsa380-2.patch @@ -0,0 +1,410 @@ +From: Jan Beulich +Subject: gnttab: replace mapkind() + +mapkind() doesn't scale very well with larger maptrack entry counts, +using a brute force linear search through all entries, with the only +option of an early loop exit if a matching writable entry was found. +Introduce a radix tree alongside the main maptrack table, thus +allowing much faster MFN-based lookup. To avoid the need to actually +allocate space for the individual nodes, encode the two counters in the +node pointers themselves, thus limiting the number of permitted +simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) / +2¹⁵-1 (32-bit) each. + +To avoid enforcing an unnecessarily low bound on the number of +simultaneous mappings of a single MFN, introduce +radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling +radix_tree_{int_to_ptr,ptr_to_int}. + +As a consequence locking changes are also applicable: With there no +longer being any inspection of the remote domain's active entries, +there's also no need anymore to hold the remote domain's grant table +lock. And since we're no longer iterating over the local domain's map +track table, the lock in map_grant_ref() can also be dropped before the +new maptrack entry actually gets populated. + +As a nice side effect this also reduces the number of IOMMU operations +in unmap_common(): Previously we would have "established" a readable +mapping whenever we didn't find a writable entry anymore (yet, of +course, at least one readable one). But we only need to do this if we +actually dropped the last writable entry, not if there were none already +before. + +This is part of CVE-2021-28698 / XSA-380. 
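The counter encoding described above -- two reference counts squeezed into the radix tree node pointer itself, mirroring the radix_tree_ulong_to_ptr()/radix_tree_ptr_to_ulong() helpers the patch introduces -- can be exercised in isolation with the stand-alone sketch below. It is only an illustration: union toy_node and the toy_* helpers are invented names, and unsigned long bitfields are a compiler extension, as in the patch itself. The bitfield widths are what give the 2^31-1 (64-bit) / 2^15-1 (32-bit) per-MFN limits mentioned above.

/* Toy sketch only -- not part of the patch. */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Two counters sharing one pointer-sized word; 2 bits stay free for the tag. */
union toy_node {
    struct {
        unsigned long rd : BITS_PER_LONG / 2 - 1;   /* r/o mappings of the MFN */
        unsigned long wr : BITS_PER_LONG / 2 - 1;   /* r/w mappings of the MFN */
    } cnt;
    unsigned long raw;
};

/* Tag the value so it can be told apart from a real (4-byte aligned) pointer. */
static void *toy_ulong_to_ptr(unsigned long val)
{
    unsigned long ptr = (val << 2) | 0x2;
    assert((ptr >> 2) == val);          /* the top two bits of val must be clear */
    return (void *)ptr;
}

static unsigned long toy_ptr_to_ulong(void *ptr)
{
    assert(((unsigned long)ptr & 0x3) == 0x2);
    return (unsigned long)ptr >> 2;
}

int main(void)
{
    union toy_node node = { .raw = 0 };
    void *slot;

    node.cnt.rd = 1;                    /* first r/o mapping of some MFN */
    slot = toy_ulong_to_ptr(node.raw);  /* value stored where a pointer goes */

    node.raw = toy_ptr_to_ulong(slot);  /* a later lookup */
    node.cnt.wr++;                      /* a r/w mapping of the same MFN arrives */
    slot = toy_ulong_to_ptr(node.raw);

    printf("rd=%lu wr=%lu\n",
           (unsigned long)node.cnt.rd, (unsigned long)node.cnt.wr);
    return 0;
}

Storing a tagged value instead of allocating a node per MFN keeps the common map/unmap path allocation-free, which is the point of the encoding.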
+ +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall +--- +I hope that limiting the map count to 32k on Arm32 is good enough. I +also hope it is out of question that 2G of mappings are enough on 64-bit +architectures. + +I'm using the grant table lock for synchronization to limit differences +in behavior to prior code. I think in principle the maptrack lock could +be used equally well. + +Shouldn't IOMMU insertions be limited anyway to GNTMAP_device_map +requests? This would further save on the number of radix tree nodes in +need of maintaining. + +I'm hesitant to introduce GNTST_* in a security patch, but being able to +tell allocation failure or counter overflow from other errors might be +worthwhile. + +I don't think adding anything to gnttab_usage_print() is useful: +radix_tree_gang_lookup() requires nodes to record their own indexes into +the tree, which we don't do to save space. Yet without indexes printing +node contents isn't very useful. Plus there's also no printing of the +main maptrack table contents. +--- +v3: Check for radix_tree_lookup_slot() returning NULL. Convert -EEXIST + to -EBUSY. Add comments. Re-base over comment addition in patch 1. +v2: New. + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -82,8 +83,13 @@ struct grant_table { + grant_status_t **status; + /* Active grant table. */ + struct active_grant_entry **active; +- /* Mapping tracking table per vcpu. */ ++ /* Handle-indexed tracking table of mappings. */ + struct grant_mapping **maptrack; ++ /* ++ * MFN-indexed tracking tree of mappings, if needed. Note that this is ++ * protected by @lock, not @maptrack_lock. ++ */ ++ struct radix_tree_root maptrack_tree; + + /* Domain to which this struct grant_table belongs. */ + const struct domain *domain; +@@ -516,34 +522,6 @@ static int get_paged_frame(unsigned long + return GNTST_okay; + } + +-static inline void +-double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) +-{ +- /* +- * See mapkind() for why the write lock is also required for the +- * remote domain. +- */ +- if ( lgt < rgt ) +- { +- grant_write_lock(lgt); +- grant_write_lock(rgt); +- } +- else +- { +- if ( lgt != rgt ) +- grant_write_lock(rgt); +- grant_write_lock(lgt); +- } +-} +- +-static inline void +-double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) +-{ +- grant_write_unlock(lgt); +- if ( lgt != rgt ) +- grant_write_unlock(rgt); +-} +- + #define INVALID_MAPTRACK_HANDLE UINT_MAX + + static inline grant_handle_t +@@ -970,41 +948,17 @@ static struct active_grant_entry *grant_ + return ERR_PTR(-EINVAL); + } + +-#define MAPKIND_READ 1 +-#define MAPKIND_WRITE 2 +-static unsigned int mapkind( +- struct grant_table *lgt, const struct domain *rd, mfn_t mfn) +-{ +- struct grant_mapping *map; +- grant_handle_t handle, limit = lgt->maptrack_limit; +- unsigned int kind = 0; +- +- /* +- * Must have the local domain's grant table write lock when +- * iterating over its maptrack entries. +- */ +- ASSERT(percpu_rw_is_write_locked(&lgt->lock)); +- /* +- * Must have the remote domain's grant table write lock while +- * counting its active entries. 
+- */ +- ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock)); +- +- smp_rmb(); +- +- for ( handle = 0; !(kind & MAPKIND_WRITE) && handle < limit; handle++ ) +- { +- map = &maptrack_entry(lgt, handle); +- if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || +- map->domid != rd->domain_id ) +- continue; +- if ( mfn_eq(_active_entry(rd->grant_table, map->ref).mfn, mfn) ) +- kind |= map->flags & GNTMAP_readonly ? +- MAPKIND_READ : MAPKIND_WRITE; +- } +- +- return kind; +-} ++union maptrack_node { ++ struct { ++ /* Radix tree slot pointers use two of the bits. */ ++#ifdef __BIG_ENDIAN_BITFIELD ++ unsigned long : 2; ++#endif ++ unsigned long rd : BITS_PER_LONG / 2 - 1; ++ unsigned long wr : BITS_PER_LONG / 2 - 1; ++ } cnt; ++ unsigned long raw; ++}; + + static void + map_grant_ref( +@@ -1023,7 +977,6 @@ map_grant_ref( + struct grant_mapping *mt; + grant_entry_header_t *shah; + uint16_t *status; +- bool_t need_iommu; + + ld = current->domain; + +@@ -1244,31 +1197,75 @@ map_grant_ref( + * as mem-sharing and IOMMU use are incompatible). The dom_io case would + * need checking separately if we compared against owner here. + */ +- need_iommu = ld != rd && gnttab_need_iommu_mapping(ld); +- if ( need_iommu ) ++ if ( ld != rd && gnttab_need_iommu_mapping(ld) ) + { ++ union maptrack_node node = { ++ .cnt.rd = !!(op->flags & GNTMAP_readonly), ++ .cnt.wr = !(op->flags & GNTMAP_readonly), ++ }; ++ int err; ++ void **slot = NULL; + unsigned int kind; + +- double_gt_lock(lgt, rgt); ++ grant_write_lock(lgt); ++ ++ err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn), ++ radix_tree_ulong_to_ptr(node.raw)); ++ if ( err == -EEXIST ) ++ { ++ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn)); ++ if ( likely(slot) ) ++ { ++ node.raw = radix_tree_ptr_to_ulong(*slot); ++ err = -EBUSY; ++ ++ /* Update node only when refcount doesn't overflow. */ ++ if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd ++ : ++node.cnt.wr ) ++ { ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ err = 0; ++ } ++ } ++ else ++ ASSERT_UNREACHABLE(); ++ } + + /* + * We're not translated, so we know that dfns and mfns are + * the same things, so the IOMMU entry is always 1-to-1. + */ +- kind = mapkind(lgt, rd, mfn); +- if ( !(op->flags & GNTMAP_readonly) && +- !(kind & MAPKIND_WRITE) ) ++ if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 ) + kind = IOMMUF_readable | IOMMUF_writable; +- else if ( !kind ) ++ else if ( (op->flags & GNTMAP_readonly) && ++ node.cnt.rd == 1 && !node.cnt.wr ) + kind = IOMMUF_readable; + else + kind = 0; +- if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind) ) ++ if ( err || ++ (kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind)) ) + { +- double_gt_unlock(lgt, rgt); ++ if ( !err ) ++ { ++ if ( slot ) ++ { ++ op->flags & GNTMAP_readonly ? node.cnt.rd-- ++ : node.cnt.wr--; ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ } ++ else ++ radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn)); ++ } ++ + rc = GNTST_general_error; +- goto undo_out; + } ++ ++ grant_write_unlock(lgt); ++ ++ if ( rc != GNTST_okay ) ++ goto undo_out; + } + + TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); +@@ -1276,10 +1273,6 @@ map_grant_ref( + /* + * All maptrack entry users check mt->flags first before using the + * other fields so just ensure the flags field is stored last. +- * +- * However, if gnttab_need_iommu_mapping() then this would race +- * with a concurrent mapkind() call (on an unmap, for example) +- * and a lock is required. 
+ */ + mt = &maptrack_entry(lgt, handle); + mt->domid = op->dom; +@@ -1287,9 +1280,6 @@ map_grant_ref( + smp_wmb(); + write_atomic(&mt->flags, op->flags); + +- if ( need_iommu ) +- double_gt_unlock(lgt, rgt); +- + op->dev_bus_addr = mfn_to_maddr(mfn); + op->handle = handle; + op->status = GNTST_okay; +@@ -1497,19 +1487,34 @@ unmap_common( + /* See the respective comment in map_grant_ref(). */ + if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) ) + { +- unsigned int kind; ++ void **slot; ++ union maptrack_node node; + int err = 0; + +- double_gt_lock(lgt, rgt); ++ grant_write_lock(lgt); ++ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn)); ++ node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0; ++ ++ /* Refcount must not underflow. */ ++ if ( !(flags & GNTMAP_readonly ? node.cnt.rd-- ++ : node.cnt.wr--) ) ++ BUG(); + +- kind = mapkind(lgt, rd, op->mfn); +- if ( !kind ) ++ if ( !node.raw ) + err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 1); +- else if ( !(kind & MAPKIND_WRITE) ) ++ else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr ) + err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 1, + IOMMUF_readable); + +- double_gt_unlock(lgt, rgt); ++ if ( err ) ++ ; ++ else if ( !node.raw ) ++ radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn)); ++ else ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ ++ grant_write_unlock(lgt); + + if ( err ) + rc = GNTST_general_error; +@@ -1956,6 +1961,8 @@ int grant_table_init(struct domain *d, i + gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack)); + if ( gt->maptrack == NULL ) + goto out; ++ ++ radix_tree_init(>->maptrack_tree); + } + + /* Shared grant table. */ +@@ -3704,6 +3711,8 @@ int gnttab_release_mappings(struct domai + + for ( handle = gt->maptrack_limit; handle; ) + { ++ mfn_t mfn; ++ + /* + * Deal with full pages such that their freeing (in the body of the + * if()) remains simple. +@@ -3801,17 +3810,31 @@ int gnttab_release_mappings(struct domai + + reduce_status_for_pin(rd, act, status, map->flags & GNTMAP_readonly); + ++ mfn = act->mfn; ++ + active_entry_release(act); + grant_read_unlock(rgt); + + rcu_unlock_domain(rd); + + map->flags = 0; ++ ++ /* ++ * This is excessive in that a single such call would suffice per ++ * mapped MFN (or none at all, if no entry was ever inserted). But it ++ * should be the common case for an MFN to be mapped just once, and ++ * this way we don't need to further maintain the counters. We also ++ * don't want to leave cleaning up of the tree as a whole to the end ++ * of the function, as this could take quite some time. ++ */ ++ radix_tree_delete(>->maptrack_tree, mfn_x(mfn)); + } + + gt->maptrack_limit = 0; + FREE_XENHEAP_PAGE(gt->maptrack[0]); + ++ radix_tree_destroy(>->maptrack_tree, NULL); ++ + return 0; + } + +--- a/xen/include/xen/radix-tree.h ++++ b/xen/include/xen/radix-tree.h +@@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int( + return (int)((long)ptr >> 2); + } + ++/** ++ * radix_tree_{ulong_to_ptr,ptr_to_ulong}: ++ * ++ * Same for unsigned long values. Beware though that only BITS_PER_LONG-2 ++ * bits are actually usable for the value. 
++ */ ++static inline void *radix_tree_ulong_to_ptr(unsigned long val) ++{ ++ unsigned long ptr = (val << 2) | 0x2; ++ ASSERT((ptr >> 2) == val); ++ return (void *)ptr; ++} ++ ++static inline unsigned long radix_tree_ptr_to_ulong(void *ptr) ++{ ++ ASSERT(((unsigned long)ptr & 0x3) == 0x2); ++ return (unsigned long)ptr >> 2; ++} ++ + int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + void *radix_tree_lookup(struct radix_tree_root *, unsigned long); + void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); diff --git a/xsa380-4.14-1.patch b/xsa380-4.14-1.patch new file mode 100644 index 0000000..7221248 --- /dev/null +++ b/xsa380-4.14-1.patch @@ -0,0 +1,148 @@ +From: Jan Beulich +Subject: gnttab: add preemption check to gnttab_release_mappings() + +A guest may die with many grant mappings still in place, or simply with +a large maptrack table. Iterating through this may take more time than +is reasonable without intermediate preemption (to run softirqs and +perhaps the scheduler). + +Move the invocation of the function to the section where other +restartable functions get invoked, and have the function itself check +for preemption every once in a while. Have it iterate the table +backwards, such that decreasing the maptrack limit is all it takes to +convey restart information. + +In domain_teardown() introduce PROG_none such that inserting at the +front will be easier going forward. + +This is part of CVE-2021-28698 / XSA-380. + +Reported-by: Andrew Cooper +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall + +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -721,11 +721,13 @@ int domain_kill(struct domain *d) + return domain_kill(d); + d->is_dying = DOMDYING_dying; + argo_destroy(d); +- gnttab_release_mappings(d); + vnuma_destroy(d->vnuma); + domain_set_outstanding_pages(d, 0); + /* fallthrough */ + case DOMDYING_dying: ++ rc = gnttab_release_mappings(d); ++ if ( rc ) ++ break; + rc = evtchn_destroy(d); + if ( rc ) + break; +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -64,7 +64,13 @@ struct grant_table { + unsigned int nr_grant_frames; + /* Number of grant status frames shared with guest (for version 2) */ + unsigned int nr_status_frames; +- /* Number of available maptrack entries. */ ++ /* ++ * Number of available maptrack entries. For cleanup purposes it is ++ * important to realize that this field and @maptrack further down will ++ * only ever be accessed by the local domain. Thus it is okay to clean ++ * up early, and to shrink the limit for the purpose of tracking cleanup ++ * progress. ++ */ + unsigned int maptrack_limit; + /* Shared grant table (see include/public/grant_table.h). */ + union { +@@ -3708,9 +3714,7 @@ do_grant_table_op( + #include "compat/grant_table.c" + #endif + +-void +-gnttab_release_mappings( +- struct domain *d) ++int gnttab_release_mappings(struct domain *d) + { + struct grant_table *gt = d->grant_table, *rgt; + struct grant_mapping *map; +@@ -3724,10 +3728,34 @@ gnttab_release_mappings( + + BUG_ON(!d->is_dying); + +- for ( handle = 0; handle < gt->maptrack_limit; handle++ ) ++ if ( !gt || !gt->maptrack ) ++ return 0; ++ ++ for ( handle = gt->maptrack_limit; handle; ) + { + unsigned int clear_flags = 0; + ++ /* ++ * Deal with full pages such that their freeing (in the body of the ++ * if()) remains simple. ++ */ ++ if ( handle < gt->maptrack_limit && !(handle % MAPTRACK_PER_PAGE) ) ++ { ++ /* ++ * Changing maptrack_limit alters nr_maptrack_frames()'es return ++ * value. 
Free the then excess trailing page right here, rather ++ * than leaving it to grant_table_destroy() (and in turn requiring ++ * to leave gt->maptrack_limit unaltered). ++ */ ++ gt->maptrack_limit = handle; ++ FREE_XENHEAP_PAGE(gt->maptrack[nr_maptrack_frames(gt)]); ++ ++ if ( hypercall_preempt_check() ) ++ return -ERESTART; ++ } ++ ++ --handle; ++ + map = &maptrack_entry(gt, handle); + if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ) + continue; +@@ -3818,6 +3846,11 @@ gnttab_release_mappings( + + map->flags = 0; + } ++ ++ gt->maptrack_limit = 0; ++ FREE_XENHEAP_PAGE(gt->maptrack[0]); ++ ++ return 0; + } + + void grant_table_warn_active_grants(struct domain *d) +@@ -3881,8 +3914,7 @@ grant_table_destroy( + free_xenheap_page(t->shared_raw[i]); + xfree(t->shared_raw); + +- for ( i = 0; i < nr_maptrack_frames(t); i++ ) +- free_xenheap_page(t->maptrack[i]); ++ ASSERT(!t->maptrack_limit); + vfree(t->maptrack); + + for ( i = 0; i < nr_active_grant_frames(t); i++ ) +--- a/xen/include/xen/grant_table.h ++++ b/xen/include/xen/grant_table.h +@@ -47,9 +47,7 @@ void grant_table_init_vcpu(struct vcpu * + void grant_table_warn_active_grants(struct domain *d); + + /* Domain death release of granted mappings of other domains' memory. */ +-void +-gnttab_release_mappings( +- struct domain *d); ++int gnttab_release_mappings(struct domain *d); + + int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref, + gfn_t *gfn, uint16_t *status); +@@ -78,7 +76,7 @@ static inline void grant_table_init_vcpu + + static inline void grant_table_warn_active_grants(struct domain *d) {} + +-static inline void gnttab_release_mappings(struct domain *d) {} ++static inline int gnttab_release_mappings(struct domain *d) { return 0; } + + static inline int mem_sharing_gref_to_gfn(struct grant_table *gt, + grant_ref_t ref, diff --git a/xsa380-4.14-2.patch b/xsa380-4.14-2.patch new file mode 100644 index 0000000..d8b38b9 --- /dev/null +++ b/xsa380-4.14-2.patch @@ -0,0 +1,383 @@ +From: Jan Beulich +Subject: gnttab: replace mapkind() + +mapkind() doesn't scale very well with larger maptrack entry counts, +using a brute force linear search through all entries, with the only +option of an early loop exit if a matching writable entry was found. +Introduce a radix tree alongside the main maptrack table, thus +allowing much faster MFN-based lookup. To avoid the need to actually +allocate space for the individual nodes, encode the two counters in the +node pointers themselves, thus limiting the number of permitted +simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) / +2¹⁵-1 (32-bit) each. + +To avoid enforcing an unnecessarily low bound on the number of +simultaneous mappings of a single MFN, introduce +radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling +radix_tree_{int_to_ptr,ptr_to_int}. + +As a consequence locking changes are also applicable: With there no +longer being any inspection of the remote domain's active entries, +there's also no need anymore to hold the remote domain's grant table +lock. And since we're no longer iterating over the local domain's map +track table, the lock in map_grant_ref() can also be dropped before the +new maptrack entry actually gets populated. + +As a nice side effect this also reduces the number of IOMMU operations +in unmap_common(): Previously we would have "established" a readable +mapping whenever we didn't find a writable entry anymore (yet, of +course, at least one readable one). 
But we only need to do this if we +actually dropped the last writable entry, not if there were none already +before. + +This is part of CVE-2021-28698 / XSA-380. + +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -82,8 +83,13 @@ struct grant_table { + grant_status_t **status; + /* Active grant table. */ + struct active_grant_entry **active; +- /* Mapping tracking table per vcpu. */ ++ /* Handle-indexed tracking table of mappings. */ + struct grant_mapping **maptrack; ++ /* ++ * MFN-indexed tracking tree of mappings, if needed. Note that this is ++ * protected by @lock, not @maptrack_lock. ++ */ ++ struct radix_tree_root maptrack_tree; + + /* Domain to which this struct grant_table belongs. */ + const struct domain *domain; +@@ -501,34 +507,6 @@ static int get_paged_frame(unsigned long + return GNTST_okay; + } + +-static inline void +-double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) +-{ +- /* +- * See mapkind() for why the write lock is also required for the +- * remote domain. +- */ +- if ( lgt < rgt ) +- { +- grant_write_lock(lgt); +- grant_write_lock(rgt); +- } +- else +- { +- if ( lgt != rgt ) +- grant_write_lock(rgt); +- grant_write_lock(lgt); +- } +-} +- +-static inline void +-double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) +-{ +- grant_write_unlock(lgt); +- if ( lgt != rgt ) +- grant_write_unlock(rgt); +-} +- + #define INVALID_MAPTRACK_HANDLE UINT_MAX + + static inline grant_handle_t +@@ -948,41 +926,17 @@ static struct active_grant_entry *grant_ + return ERR_PTR(-EINVAL); + } + +-#define MAPKIND_READ 1 +-#define MAPKIND_WRITE 2 +-static unsigned int mapkind( +- struct grant_table *lgt, const struct domain *rd, mfn_t mfn) +-{ +- struct grant_mapping *map; +- grant_handle_t handle, limit = lgt->maptrack_limit; +- unsigned int kind = 0; +- +- /* +- * Must have the local domain's grant table write lock when +- * iterating over its maptrack entries. +- */ +- ASSERT(percpu_rw_is_write_locked(&lgt->lock)); +- /* +- * Must have the remote domain's grant table write lock while +- * counting its active entries. +- */ +- ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock)); +- +- smp_rmb(); +- +- for ( handle = 0; !(kind & MAPKIND_WRITE) && handle < limit; handle++ ) +- { +- map = &maptrack_entry(lgt, handle); +- if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || +- map->domid != rd->domain_id ) +- continue; +- if ( mfn_eq(_active_entry(rd->grant_table, map->ref).mfn, mfn) ) +- kind |= map->flags & GNTMAP_readonly ? +- MAPKIND_READ : MAPKIND_WRITE; +- } +- +- return kind; +-} ++union maptrack_node { ++ struct { ++ /* Radix tree slot pointers use two of the bits. */ ++#ifdef __BIG_ENDIAN_BITFIELD ++ unsigned long : 2; ++#endif ++ unsigned long rd : BITS_PER_LONG / 2 - 1; ++ unsigned long wr : BITS_PER_LONG / 2 - 1; ++ } cnt; ++ unsigned long raw; ++}; + + static void + map_grant_ref( +@@ -1001,7 +955,6 @@ map_grant_ref( + struct grant_mapping *mt; + grant_entry_header_t *shah; + uint16_t *status; +- bool_t need_iommu; + + ld = current->domain; + +@@ -1220,31 +1173,75 @@ map_grant_ref( + * as mem-sharing and IOMMU use are incompatible). The dom_io case would + * need checking separately if we compared against owner here. 
+ */ +- need_iommu = ld != rd && gnttab_need_iommu_mapping(ld); +- if ( need_iommu ) ++ if ( ld != rd && gnttab_need_iommu_mapping(ld) ) + { ++ union maptrack_node node = { ++ .cnt.rd = !!(op->flags & GNTMAP_readonly), ++ .cnt.wr = !(op->flags & GNTMAP_readonly), ++ }; ++ int err; ++ void **slot = NULL; + unsigned int kind; + +- double_gt_lock(lgt, rgt); ++ grant_write_lock(lgt); ++ ++ err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn), ++ radix_tree_ulong_to_ptr(node.raw)); ++ if ( err == -EEXIST ) ++ { ++ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn)); ++ if ( likely(slot) ) ++ { ++ node.raw = radix_tree_ptr_to_ulong(*slot); ++ err = -EBUSY; ++ ++ /* Update node only when refcount doesn't overflow. */ ++ if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd ++ : ++node.cnt.wr ) ++ { ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ err = 0; ++ } ++ } ++ else ++ ASSERT_UNREACHABLE(); ++ } + + /* + * We're not translated, so we know that dfns and mfns are + * the same things, so the IOMMU entry is always 1-to-1. + */ +- kind = mapkind(lgt, rd, mfn); +- if ( !(op->flags & GNTMAP_readonly) && +- !(kind & MAPKIND_WRITE) ) ++ if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 ) + kind = IOMMUF_readable | IOMMUF_writable; +- else if ( !kind ) ++ else if ( (op->flags & GNTMAP_readonly) && ++ node.cnt.rd == 1 && !node.cnt.wr ) + kind = IOMMUF_readable; + else + kind = 0; +- if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 0, kind) ) ++ if ( err || ++ (kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 0, kind)) ) + { +- double_gt_unlock(lgt, rgt); ++ if ( !err ) ++ { ++ if ( slot ) ++ { ++ op->flags & GNTMAP_readonly ? node.cnt.rd-- ++ : node.cnt.wr--; ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ } ++ else ++ radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn)); ++ } ++ + rc = GNTST_general_error; +- goto undo_out; + } ++ ++ grant_write_unlock(lgt); ++ ++ if ( rc != GNTST_okay ) ++ goto undo_out; + } + + TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); +@@ -1252,10 +1249,6 @@ map_grant_ref( + /* + * All maptrack entry users check mt->flags first before using the + * other fields so just ensure the flags field is stored last. +- * +- * However, if gnttab_need_iommu_mapping() then this would race +- * with a concurrent mapkind() call (on an unmap, for example) +- * and a lock is required. + */ + mt = &maptrack_entry(lgt, handle); + mt->domid = op->dom; +@@ -1263,9 +1256,6 @@ map_grant_ref( + smp_wmb(); + write_atomic(&mt->flags, op->flags); + +- if ( need_iommu ) +- double_gt_unlock(lgt, rgt); +- + op->dev_bus_addr = mfn_to_maddr(mfn); + op->handle = handle; + op->status = GNTST_okay; +@@ -1487,19 +1477,34 @@ unmap_common( + /* See the respective comment in map_grant_ref(). */ + if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) ) + { +- unsigned int kind; ++ void **slot; ++ union maptrack_node node; + int err = 0; + +- double_gt_lock(lgt, rgt); ++ grant_write_lock(lgt); ++ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn)); ++ node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0; ++ ++ /* Refcount must not underflow. */ ++ if ( !(flags & GNTMAP_readonly ? 
node.cnt.rd-- ++ : node.cnt.wr--) ) ++ BUG(); + +- kind = mapkind(lgt, rd, op->mfn); +- if ( !kind ) ++ if ( !node.raw ) + err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 0); +- else if ( !(kind & MAPKIND_WRITE) ) ++ else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr ) + err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 0, + IOMMUF_readable); + +- double_gt_unlock(lgt, rgt); ++ if ( err ) ++ ; ++ else if ( !node.raw ) ++ radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn)); ++ else ++ radix_tree_replace_slot(slot, ++ radix_tree_ulong_to_ptr(node.raw)); ++ ++ grant_write_unlock(lgt); + + if ( err ) + rc = GNTST_general_error; +@@ -1951,6 +1956,8 @@ int grant_table_init(struct domain *d, i + gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack)); + if ( gt->maptrack == NULL ) + goto out; ++ ++ radix_tree_init(>->maptrack_tree); + } + + /* Shared grant table. */ +@@ -3734,6 +3741,7 @@ int gnttab_release_mappings(struct domai + for ( handle = gt->maptrack_limit; handle; ) + { + unsigned int clear_flags = 0; ++ mfn_t mfn; + + /* + * Deal with full pages such that their freeing (in the body of the +@@ -3839,17 +3847,31 @@ int gnttab_release_mappings(struct domai + if ( clear_flags ) + gnttab_clear_flags(rd, clear_flags, status); + ++ mfn = act->mfn; ++ + active_entry_release(act); + grant_read_unlock(rgt); + + rcu_unlock_domain(rd); + + map->flags = 0; ++ ++ /* ++ * This is excessive in that a single such call would suffice per ++ * mapped MFN (or none at all, if no entry was ever inserted). But it ++ * should be the common case for an MFN to be mapped just once, and ++ * this way we don't need to further maintain the counters. We also ++ * don't want to leave cleaning up of the tree as a whole to the end ++ * of the function, as this could take quite some time. ++ */ ++ radix_tree_delete(>->maptrack_tree, mfn_x(mfn)); + } + + gt->maptrack_limit = 0; + FREE_XENHEAP_PAGE(gt->maptrack[0]); + ++ radix_tree_destroy(>->maptrack_tree, NULL); ++ + return 0; + } + +--- a/xen/include/xen/radix-tree.h ++++ b/xen/include/xen/radix-tree.h +@@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int( + return (int)((long)ptr >> 2); + } + ++/** ++ * radix_tree_{ulong_to_ptr,ptr_to_ulong}: ++ * ++ * Same for unsigned long values. Beware though that only BITS_PER_LONG-2 ++ * bits are actually usable for the value. ++ */ ++static inline void *radix_tree_ulong_to_ptr(unsigned long val) ++{ ++ unsigned long ptr = (val << 2) | 0x2; ++ ASSERT((ptr >> 2) == val); ++ return (void *)ptr; ++} ++ ++static inline unsigned long radix_tree_ptr_to_ulong(void *ptr) ++{ ++ ASSERT(((unsigned long)ptr & 0x3) == 0x2); ++ return (unsigned long)ptr >> 2; ++} ++ + int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + void *radix_tree_lookup(struct radix_tree_root *, unsigned long); + void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); diff --git a/xsa382.patch b/xsa382.patch new file mode 100644 index 0000000..936c2de --- /dev/null +++ b/xsa382.patch @@ -0,0 +1,34 @@ +From: Jan Beulich +Subject: gnttab: fix array capacity check in gnttab_get_status_frames() + +The number of grant frames is of no interest here; converting the passed +in op.nr_frames this way means we allow for 8 times as many GFNs to be +written as actually fit in the array. We would corrupt xlat areas of +higher vCPU-s (after having faulted many times while trying to write to +the guard pages between any two areas) for 32-bit PV guests. 
For HVM +guests we'd simply crash as soon as we hit the first guard page, as +accesses to the xlat area are simply memcpy() there. + +This is CVE-2021-28699 / XSA-382. + +Fixes: 18b1be5e324b ("gnttab: make resource limits per domain") +Signed-off-by: Jan Beulich + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3243,12 +3243,11 @@ gnttab_get_status_frames(XEN_GUEST_HANDL + goto unlock; + } + +- if ( unlikely(limit_max < grant_to_status_frames(op.nr_frames)) ) ++ if ( unlikely(limit_max < op.nr_frames) ) + { + gdprintk(XENLOG_WARNING, +- "grant_to_status_frames(%u) for d%d is too large (%u,%u)\n", +- op.nr_frames, d->domain_id, +- grant_to_status_frames(op.nr_frames), limit_max); ++ "nr_status_frames for %pd is too large (%u,%u)\n", ++ d, op.nr_frames, limit_max); + op.status = GNTST_general_error; + goto unlock; + } diff --git a/xsa383.patch b/xsa383.patch new file mode 100644 index 0000000..9ab5eb3 --- /dev/null +++ b/xsa383.patch @@ -0,0 +1,55 @@ +From: Julien Grall +Date: Sat, 3 Jul 2021 14:03:36 +0100 +Subject: [PATCH] xen/arm: Restrict the amount of memory that dom0less domU and + dom0 can allocate + +Currently, both dom0less domUs and dom0 can allocate an "unlimited" +amount of memory because d->max_pages is set to ~0U. + +In particular, the former are meant to be unprivileged. Therefore the +memory they could allocate should be bounded. As the domain are not yet +officially aware of Xen (we don't expose advertise it in the DT, yet +the hypercalls are accessible), they should not need to allocate more +than the initial amount. So cap set d->max_pages directly the amount of +memory we are meant to allocate. + +Take the opportunity to also restrict the memory for dom0 as the +domain is direct mapped (e.g. MFN == GFN) and therefore cannot +allocate outside of the pre-allocated region. + +This is CVE-2021-28700 / XSA-383. + +Reported-by: Julien Grall +Signed-off-by: Julien Grall +Reviewed-by: Stefano Stabellini +Tested-by: Stefano Stabellini +--- + xen/arch/arm/domain_build.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c +index 6c86d527810f..206038d1c022 100644 +--- a/xen/arch/arm/domain_build.c ++++ b/xen/arch/arm/domain_build.c +@@ -2440,7 +2440,8 @@ static int __init construct_domU(struct domain *d, + + if ( vcpu_create(d, 0) == NULL ) + return -ENOMEM; +- d->max_pages = ~0U; ++ ++ d->max_pages = ((paddr_t)mem * SZ_1K) >> PAGE_SHIFT; + + kinfo.d = d; + +@@ -2546,7 +2547,7 @@ static int __init construct_dom0(struct domain *d) + + iommu_hwdom_init(d); + +- d->max_pages = ~0U; ++ d->max_pages = dom0_mem >> PAGE_SHIFT; + + kinfo.unassigned_mem = dom0_mem; + kinfo.d = d; +-- +2.17.1 +
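For reference, the max_pages arithmetic the XSA-383 patch introduces works out as in the small stand-alone sketch below, assuming 4 KiB pages (PAGE_SHIFT == 12) and a dom0less "mem" value expressed in KiB; the names here are invented for the example.

/* Toy sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12          /* 4 KiB pages */
#define SZ_1K      1024ULL

int main(void)
{
    uint64_t mem_kb = 512 * 1024;                        /* a 512 MiB domU */
    uint64_t max_pages = (mem_kb * SZ_1K) >> PAGE_SHIFT; /* bytes -> pages */

    /* 512 MiB / 4 KiB = 131072 pages, instead of the former ~0U (unbounded) */
    printf("max_pages = %llu\n", (unsigned long long)max_pages);
    return 0;
}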