From 7724e33e7340b2beac564efcd2629b3c7a1939f2 Mon Sep 17 00:00:00 2001 From: Michael Young Date: Sep 12 2021 15:19:18 +0000 Subject: update to xen-4.15.1 --- diff --git a/.gitignore b/.gitignore index 794101c..10627f5 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ lwip-1.3.0.tar.gz pciutils-2.2.9.tar.bz2 zlib-1.2.3.tar.gz polarssl-1.1.4-gpl.tgz -/xen-4.15.0.tar.gz +/xen-4.15.1.tar.gz diff --git a/sources b/sources index 875f44f..977c8aa 100644 --- a/sources +++ b/sources @@ -4,4 +4,4 @@ SHA512 (newlib-1.16.0.tar.gz) = 40eb96bbc6736a16b6399e0cdb73e853d0d90b685c967e77 SHA512 (zlib-1.2.3.tar.gz) = 021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e SHA512 (polarssl-1.1.4-gpl.tgz) = 88da614e4d3f4409c4fd3bb3e44c7587ba051e3fed4e33d526069a67e8180212e1ea22da984656f50e290049f60ddca65383e5983c0f8884f648d71f698303ad SHA512 (pciutils-2.2.9.tar.bz2) = 2b3d98d027e46d8c08037366dde6f0781ca03c610ef2b380984639e4ef39899ed8d8b8e4cd9c9dc54df101279b95879bd66bfd4d04ad07fef41e847ea7ae32b5 -SHA512 (xen-4.15.0.tar.gz) = 93683b8a97387ca5f003c635a11d163e61c87dbdc9a03081f9155fe87b49f1dfa74ce243fcd5e04dc009353a36e2375b786f1ebde828b5951a094cd64197b4c7 +SHA512 (xen-4.15.1.tar.gz) = 8d3cbdf708f46477e32ee7cbd16a490c82efa855cecd84ee712b8680df4d69c987ba9ab00ff3851f627b98a8ebbc5dab71f92f142ed958ee2bc538bc792cd4b9 diff --git a/xen.gcc11.fixes.patch b/xen.gcc11.fixes.patch index a075a6f..31db0cb 100644 --- a/xen.gcc11.fixes.patch +++ b/xen.gcc11.fixes.patch @@ -9,28 +9,6 @@ unsigned int mbytes, vmac_ctx_t *ctx); ---- xen-4.14.0/tools/libs/foreignmemory/linux.c.orig 2020-07-23 16:07:51.000000000 +0100 -+++ xen-4.14.0/tools/libs/foreignmemory/linux.c 2020-10-25 21:36:00.982040566 +0000 -@@ -162,7 +162,7 @@ - void *osdep_xenforeignmemory_map(xenforeignmemory_handle *fmem, - uint32_t dom, void *addr, - int prot, int flags, size_t num, -- const xen_pfn_t arr[/*num*/], int err[/*num*/]) -+ const xen_pfn_t arr[num], int err[num]) - { - int fd = fmem->fd; - privcmd_mmapbatch_v2_t ioctlx; ---- xen-4.14.0/tools/libs/foreignmemory/minios.c.orig 2020-07-23 16:07:51.000000000 +0100 -+++ xen-4.14.0/tools/libs/foreignmemory/minios.c 2020-10-26 22:36:12.423883688 +0000 -@@ -42,7 +42,7 @@ - void *osdep_xenforeignmemory_map(xenforeignmemory_handle *fmem, - uint32_t dom, void *addr, - int prot, int flags, size_t num, -- const xen_pfn_t arr[/*num*/], int err[/*num*/]) -+ const xen_pfn_t arr[num], int err[num]) - { - unsigned long pt_prot = 0; - if (prot & PROT_READ) diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c index 320e06f..618ae92 100644 --- a/xen/arch/x86/tboot.c @@ -44,20 +22,3 @@ index 320e06f..618ae92 100644 /* Look for valid page-aligned address for shared page. */ if ( !opt_tboot_pa || (opt_tboot_pa & ~PAGE_MASK) ) -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index 84bb8e0..6ecf5db 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -725,9 +725,9 @@ union vex { - #define copy_VEX(ptr, vex) ({ \ - if ( !mode_64bit() ) \ - (vex).reg |= 8; \ -- (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \ -- (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \ -- (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \ -+ ((volatile uint8_t *)ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 
0xc4 : 0x8f; \ -+ ((volatile uint8_t *)ptr)[1 - PFX_BYTES] = (vex).raw[0]; \ -+ ((volatile uint8_t *)ptr)[2 - PFX_BYTES] = (vex).raw[1]; \ - container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \ - }) - diff --git a/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch b/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch deleted file mode 100644 index 6eb4f71..0000000 --- a/xen.git-3581714729cf84df50d6831c4da076e21587408c.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 3581714729cf84df50d6831c4da076e21587408c Mon Sep 17 00:00:00 2001 -From: Jan Beulich -Date: Thu, 15 Jul 2021 09:21:04 +0200 -Subject: [PATCH] VT-d: adjust domid map updating when unmapping context - -When an earlier error occurred, cleaning up the domid mapping data is -wrong, as references likely still exist. The only exception to this is -when the actual unmapping worked, but some flush failed (supposedly -impossible after XSA-373). The guest will get crashed in such a case -though, so add fallback cleanup to domain destruction to cover this -case. This in turn makes it desirable to silence the dprintk() in -domain_iommu_domid(). - -Note that no error will be returned anymore when the lookup fails - in -the common case lookup failure would already have caused -domain_context_unmap_one() to fail, yet even from a more general -perspective it doesn't look right to fail domain_context_unmap() in such -a case when this was the last device, but not when any earlier unmap was -otherwise successful. - -Signed-off-by: Jan Beulich -Reviewed-by: Kevin Tian -master commit: 32655880057ce2829f962d46916ea6cec60f98d3 -master date: 2021-06-24 16:29:13 +0200 ---- - xen/drivers/passthrough/vtd/iommu.c | 39 ++++++++++++++++++----------- - 1 file changed, 24 insertions(+), 15 deletions(-) - -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index c0070f1c99..2b3609dae2 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -80,9 +80,11 @@ static int domain_iommu_domid(struct domain *d, - i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); - } - -- dprintk(XENLOG_ERR VTDPREFIX, -- "Cannot get valid iommu domid: domid=%d iommu->index=%d\n", -- d->domain_id, iommu->index); -+ if ( !d->is_dying ) -+ dprintk(XENLOG_ERR VTDPREFIX, -+ "Cannot get valid iommu %u domid: %pd\n", -+ iommu->index, d); -+ - return -1; - } - -@@ -147,6 +149,17 @@ static int context_get_domain_id(struct context_entry *context, - return domid; - } - -+static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) -+{ -+ int iommu_domid = domain_iommu_domid(domain, iommu); -+ -+ if ( iommu_domid >= 0 ) -+ { -+ clear_bit(iommu_domid, iommu->domid_bitmap); -+ iommu->domid_map[iommu_domid] = 0; -+ } -+} -+ - static void sync_cache(const void *addr, unsigned int size) - { - static unsigned long clflush_size = 0; -@@ -1732,6 +1745,9 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, - goto out; - } - -+ if ( ret ) -+ goto out; -+ - /* - * if no other devices under the same iommu owned by this domain, - * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp -@@ -1751,19 +1767,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, - - if ( found == 0 ) - { -- int iommu_domid; -- - clear_bit(iommu->index, &dom_iommu(domain)->arch.vtd.iommu_bitmap); -- -- iommu_domid = domain_iommu_domid(domain, iommu); -- if ( iommu_domid == -1 ) -- { -- ret = -EINVAL; -- goto out; -- } -- -- clear_bit(iommu_domid, iommu->domid_bitmap); -- 
iommu->domid_map[iommu_domid] = 0; -+ cleanup_domid_map(domain, iommu); - } - - out: -@@ -1783,6 +1788,7 @@ static void iommu_domain_teardown(struct domain *d) - { - struct domain_iommu *hd = dom_iommu(d); - struct mapped_rmrr *mrmrr, *tmp; -+ const struct acpi_drhd_unit *drhd; - - if ( list_empty(&acpi_drhd_units) ) - return; -@@ -1794,6 +1800,9 @@ static void iommu_domain_teardown(struct domain *d) - } - - ASSERT(!hd->arch.vtd.pgd_maddr); -+ -+ for_each_drhd_unit ( drhd ) -+ cleanup_domid_map(d, drhd->iommu); - } - - static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, --- -2.30.2 - diff --git a/xen.git-58ad654ebce7ccb272a3f4f3482c03aaad850d31.patch b/xen.git-58ad654ebce7ccb272a3f4f3482c03aaad850d31.patch deleted file mode 100644 index 7229049..0000000 --- a/xen.git-58ad654ebce7ccb272a3f4f3482c03aaad850d31.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 58ad654ebce7ccb272a3f4f3482c03aaad850d31 Mon Sep 17 00:00:00 2001 -From: Jan Beulich -Date: Thu, 22 Jul 2021 11:20:38 +0200 -Subject: [PATCH] x86: work around build issue with GNU ld 2.37 - -I suspect it is commit 40726f16a8d7 ("ld script expression parsing") -which broke the hypervisor build, by no longer accepting section names -with a dash in them inside ADDR() (and perhaps other script directives -expecting just a section name, not an expression): .note.gnu.build-id -is such a section. - -Quoting all section names passed to ADDR() via DECL_SECTION() works -around the regression. - -Signed-off-by: Jan Beulich -Acked-by: Andrew Cooper ---- - xen/arch/x86/xen.lds.S | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S -index 9c6c1c8005..955d5cf4a0 100644 ---- a/xen/arch/x86/xen.lds.S -+++ b/xen/arch/x86/xen.lds.S -@@ -18,7 +18,7 @@ ENTRY(efi_start) - #else /* !EFI */ - - #define FORMAT "elf64-x86-64" --#define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START) -+#define DECL_SECTION(x) x : AT(ADDR(#x) - __XEN_VIRT_START) - - ENTRY(start_pa) - --- -2.30.2 - diff --git a/xen.spec b/xen.spec index 3a85379..9b0c237 100644 --- a/xen.spec +++ b/xen.spec @@ -57,8 +57,8 @@ Summary: Xen is a virtual machine monitor Name: xen -Version: 4.15.0 -Release: 7%{?dist} +Version: 4.15.1 +Release: 1%{?dist} License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -113,32 +113,6 @@ Patch41: xen.gcc9.fixes.patch Patch42: xen.gcc10.fixes.patch Patch43: xen.gcc11.fixes.patch Patch44: xen.ocaml.4.12.fixes.patch -Patch45: xsa372-4.15-0001-xen-arm-Create-dom0less-domUs-earlier.patch -Patch46: xsa372-4.15-0002-xen-arm-Boot-modules-should-always-be-scrubbed-if-bo.patch -Patch47: xsa373-4.15-1.patch -Patch48: xsa373-4.15-2.patch -Patch49: xsa373-4.15-3.patch -Patch50: xsa373-4.15-4.patch -Patch51: xsa373-4.15-5.patch -Patch52: xsa375.patch -Patch53: xsa377.patch -Patch54: xen.git-58ad654ebce7ccb272a3f4f3482c03aaad850d31.patch -Patch55: xen.git-3581714729cf84df50d6831c4da076e21587408c.patch -Patch56: xsa378-4.15-1.patch -Patch57: xsa378-4.15-2.patch -Patch58: xsa378-4.15-3.patch -Patch59: xsa378-4.15-4.patch -Patch60: xsa378-4.15-5.patch -Patch61: xsa378-4.15-6.patch -Patch62: xsa378-4.15-7.patch -Patch63: xsa378-4.15-8.patch -Patch64: xsa379-4.15.patch -Patch65: xsa380-1.patch -Patch66: xsa380-2.patch -Patch67: xsa382.patch -Patch68: xsa383.patch -Patch69: xsa380-3.patch -Patch70: xsa384.patch %if %build_qemutrad @@ -348,32 +322,6 @@ manage Xen virtual machines. 
%patch42 -p1 %patch43 -p1 %patch44 -p1 -%patch45 -p1 -%patch46 -p1 -%patch47 -p1 -%patch48 -p1 -%patch49 -p1 -%patch50 -p1 -%patch51 -p1 -%patch52 -p1 -%patch53 -p1 -%patch54 -p1 -%patch55 -p1 -%patch56 -p1 -%patch57 -p1 -%patch58 -p1 -%patch59 -p1 -%patch60 -p1 -%patch61 -p1 -%patch62 -p1 -%patch63 -p1 -%patch64 -p1 -%patch65 -p1 -%patch66 -p1 -%patch67 -p1 -%patch68 -p1 -%patch69 -p1 -%patch70 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -726,7 +674,7 @@ fi %files libs %{_libdir}/libxencall.so.1 -%{_libdir}/libxencall.so.1.2 +%{_libdir}/libxencall.so.1.3 %{_libdir}/libxenctrl.so.4.* %{_libdir}/libxendevicemodel.so.1 %{_libdir}/libxendevicemodel.so.1.4 @@ -969,6 +917,11 @@ fi %endif %changelog +* Sun Sep 12 2021 Michael Young - 4.15.1-1 +- update to xen-4.15.1 + remove or adjust patches now included or superceded upstream + update libxencall version + * Wed Sep 08 2021 Michael Young - 4.15.0-7 - Another race in XENMAPSPACE_grant_table handling [XSA-384, CVE-2021-28701] (#2002786) diff --git a/xsa372-4.15-0001-xen-arm-Create-dom0less-domUs-earlier.patch b/xsa372-4.15-0001-xen-arm-Create-dom0less-domUs-earlier.patch deleted file mode 100644 index a21dba4..0000000 --- a/xsa372-4.15-0001-xen-arm-Create-dom0less-domUs-earlier.patch +++ /dev/null @@ -1,85 +0,0 @@ -From b1e5a89f19d9919c3eae17ab9c6a663b0801ad9c Mon Sep 17 00:00:00 2001 -From: Julien Grall -Date: Mon, 17 May 2021 17:47:13 +0100 -Subject: [PATCH 1/2] xen/arm: Create dom0less domUs earlier - -In a follow-up patch we will need to unallocate the boot modules -before heap_init_late() is called. - -The modules will contain the domUs kernel and initramfs. Therefore Xen -will need to create extra domUs (used by dom0less) before heap_init_late(). - -This has two consequences on dom0less: - 1) Domains will not be unpaused as soon as they are created but - once all have been created. However, Xen doesn't guarantee an order - to unpause, so this is not something one could rely on. - - 2) The memory allocated for a domU will not be scrubbed anymore when an - admin select bootscrub=on. This is not something we advertised, but if - this is a concern we can introduce either force scrub for all domUs or - a per-domain flag in the DT. The behavior for bootscrub=off and - bootscrub=idle (default) has not changed. - -This is part of XSA-372 / CVE-2021-28693. 
- -Signed-off-by: Julien Grall -Reviewed-by: Jan Beulich -Reviewed-by: Stefano Stabellini -Tested-by: Stefano Stabellini ---- - xen/arch/arm/domain_build.c | 2 -- - xen/arch/arm/setup.c | 11 ++++++----- - 2 files changed, 6 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c -index 374bf655ee34..4203ddcca0e3 100644 ---- a/xen/arch/arm/domain_build.c -+++ b/xen/arch/arm/domain_build.c -@@ -2515,8 +2515,6 @@ void __init create_domUs(void) - - if ( construct_domU(d, node) != 0 ) - panic("Could not set up domain %s\n", dt_node_name(node)); -- -- domain_unpause_by_systemcontroller(d); - } - } - -diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c -index 2532ec973913..441e0e16e9f0 100644 ---- a/xen/arch/arm/setup.c -+++ b/xen/arch/arm/setup.c -@@ -804,7 +804,7 @@ void __init start_xen(unsigned long boot_phys_offset, - int cpus, i; - const char *cmdline; - struct bootmodule *xen_bootmodule; -- struct domain *dom0; -+ struct domain *dom0, *d; - struct xen_domctl_createdomain dom0_cfg = { - .flags = XEN_DOMCTL_CDF_hvm | XEN_DOMCTL_CDF_hap, - .max_evtchn_port = -1, -@@ -987,6 +987,9 @@ void __init start_xen(unsigned long boot_phys_offset, - if ( construct_dom0(dom0) != 0) - panic("Could not set up DOM0 guest OS\n"); - -+ if ( acpi_disabled ) -+ create_domUs(); -+ - heap_init_late(); - - init_trace_bufs(); -@@ -1000,10 +1003,8 @@ void __init start_xen(unsigned long boot_phys_offset, - - system_state = SYS_STATE_active; - -- if ( acpi_disabled ) -- create_domUs(); -- -- domain_unpause_by_systemcontroller(dom0); -+ for_each_domain( d ) -+ domain_unpause_by_systemcontroller(d); - - /* Switch on to the dynamically allocated stack for the idle vcpu - * since the static one we're running on is about to be freed. */ --- -2.17.1 - diff --git a/xsa372-4.15-0002-xen-arm-Boot-modules-should-always-be-scrubbed-if-bo.patch b/xsa372-4.15-0002-xen-arm-Boot-modules-should-always-be-scrubbed-if-bo.patch deleted file mode 100644 index 9c322b1..0000000 --- a/xsa372-4.15-0002-xen-arm-Boot-modules-should-always-be-scrubbed-if-bo.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 09bb28bdef3fb5e7d08bdd641601ca0c0d4d82b4 Mon Sep 17 00:00:00 2001 -From: Julien Grall -Date: Sat, 17 Apr 2021 17:38:28 +0100 -Subject: [PATCH 2/2] xen/arm: Boot modules should always be scrubbed if - bootscrub={on, idle} - -The function to initialize the pages (see init_heap_pages()) will request -scrub when the admin request idle bootscrub (default) and state == -SYS_STATE_active. When bootscrub=on, Xen will scrub any free pages in -heap_init_late(). - -Currently, the boot modules (e.g. kernels, initramfs) will be discarded/ -freed after heap_init_late() is called and system_state switched to -SYS_STATE_active. This means the pages associated with the boot modules -will not get scrubbed before getting re-purposed. - -If the memory is assigned to an untrusted domU, it may be able to -retrieve secrets from the modules. - -This is part of XSA-372 / CVE-2021-28693. 
- -Fixes: 1774e9b1df27 ("xen/arm: introduce create_domUs") -Signed-off-by: Julien Grall -Reviewed-by: Jan Beulich -Reviewed-by: Stefano Stabellini -Tested-by: Stefano Stabellini ---- - xen/arch/arm/setup.c | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c -index 441e0e16e9f0..8afb78f2c985 100644 ---- a/xen/arch/arm/setup.c -+++ b/xen/arch/arm/setup.c -@@ -72,8 +72,6 @@ domid_t __read_mostly max_init_domid; - - static __used void init_done(void) - { -- discard_initial_modules(); -- - /* Must be done past setting system_state. */ - unregister_init_virtual_region(); - -@@ -990,6 +988,12 @@ void __init start_xen(unsigned long boot_phys_offset, - if ( acpi_disabled ) - create_domUs(); - -+ /* -+ * This needs to be called **before** heap_init_late() so modules -+ * will be scrubbed (unless suppressed). -+ */ -+ discard_initial_modules(); -+ - heap_init_late(); - - init_trace_bufs(); --- -2.17.1 - diff --git a/xsa373-4.15-1.patch b/xsa373-4.15-1.patch deleted file mode 100644 index ee5229a..0000000 --- a/xsa373-4.15-1.patch +++ /dev/null @@ -1,120 +0,0 @@ -From: Jan Beulich -Subject: VT-d: size qinval queue dynamically - -With the present synchronous model, we need two slots for every -operation (the operation itself and a wait descriptor). There can be -one such pair of requests pending per CPU. To ensure that under all -normal circumstances a slot is always available when one is requested, -size the queue ring according to the number of present CPUs. - -This is part of XSA-373 / CVE-2021-28692. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/vtd/iommu.h -+++ b/xen/drivers/passthrough/vtd/iommu.h -@@ -450,17 +450,9 @@ struct qinval_entry { - }q; - }; - --/* Order of queue invalidation pages(max is 8) */ --#define QINVAL_PAGE_ORDER 2 -- --#define QINVAL_ARCH_PAGE_ORDER (QINVAL_PAGE_ORDER + PAGE_SHIFT_4K - PAGE_SHIFT) --#define QINVAL_ARCH_PAGE_NR ( QINVAL_ARCH_PAGE_ORDER < 0 ? 
\ -- 1 : \ -- 1 << QINVAL_ARCH_PAGE_ORDER ) -- - /* Each entry is 16 bytes, so 2^8 entries per page */ - #define QINVAL_ENTRY_ORDER ( PAGE_SHIFT - 4 ) --#define QINVAL_ENTRY_NR (1 << (QINVAL_PAGE_ORDER + 8)) -+#define QINVAL_MAX_ENTRY_NR (1u << (7 + QINVAL_ENTRY_ORDER)) - - /* Status data flag */ - #define QINVAL_STAT_INIT 0 ---- a/xen/drivers/passthrough/vtd/qinval.c -+++ b/xen/drivers/passthrough/vtd/qinval.c -@@ -31,6 +31,9 @@ - - #define VTD_QI_TIMEOUT 1 - -+static unsigned int __read_mostly qi_pg_order; -+static unsigned int __read_mostly qi_entry_nr; -+ - static int __must_check invalidate_sync(struct vtd_iommu *iommu); - - static void print_qi_regs(struct vtd_iommu *iommu) -@@ -55,7 +58,7 @@ static unsigned int qinval_next_index(st - tail >>= QINVAL_INDEX_SHIFT; - - /* (tail+1 == head) indicates a full queue, wait for HW */ -- while ( ( tail + 1 ) % QINVAL_ENTRY_NR == -+ while ( ((tail + 1) & (qi_entry_nr - 1)) == - ( dmar_readq(iommu->reg, DMAR_IQH_REG) >> QINVAL_INDEX_SHIFT ) ) - cpu_relax(); - -@@ -68,7 +71,7 @@ static void qinval_update_qtail(struct v - - /* Need hold register lock when update tail */ - ASSERT( spin_is_locked(&iommu->register_lock) ); -- val = (index + 1) % QINVAL_ENTRY_NR; -+ val = (index + 1) & (qi_entry_nr - 1); - dmar_writeq(iommu->reg, DMAR_IQT_REG, (val << QINVAL_INDEX_SHIFT)); - } - -@@ -403,8 +406,28 @@ int enable_qinval(struct vtd_iommu *iomm - - if ( iommu->qinval_maddr == 0 ) - { -- iommu->qinval_maddr = alloc_pgtable_maddr(QINVAL_ARCH_PAGE_NR, -- iommu->node); -+ if ( !qi_entry_nr ) -+ { -+ /* -+ * With the present synchronous model, we need two slots for every -+ * operation (the operation itself and a wait descriptor). There -+ * can be one such pair of requests pending per CPU. One extra -+ * entry is needed as the ring is considered full when there's -+ * only one entry left. -+ */ -+ BUILD_BUG_ON(CONFIG_NR_CPUS * 2 >= QINVAL_MAX_ENTRY_NR); -+ qi_pg_order = get_order_from_bytes((num_present_cpus() * 2 + 1) << -+ (PAGE_SHIFT - -+ QINVAL_ENTRY_ORDER)); -+ qi_entry_nr = 1u << (qi_pg_order + QINVAL_ENTRY_ORDER); -+ -+ dprintk(XENLOG_INFO VTDPREFIX, -+ "QI: using %u-entry ring(s)\n", qi_entry_nr); -+ } -+ -+ iommu->qinval_maddr = -+ alloc_pgtable_maddr(qi_entry_nr >> QINVAL_ENTRY_ORDER, -+ iommu->node); - if ( iommu->qinval_maddr == 0 ) - { - dprintk(XENLOG_WARNING VTDPREFIX, -@@ -418,15 +441,16 @@ int enable_qinval(struct vtd_iommu *iomm - - spin_lock_irqsave(&iommu->register_lock, flags); - -- /* Setup Invalidation Queue Address(IQA) register with the -- * address of the page we just allocated. QS field at -- * bits[2:0] to indicate size of queue is one 4KB page. -- * That's 256 entries. Queued Head (IQH) and Queue Tail (IQT) -- * registers are automatically reset to 0 with write -- * to IQA register. -+ /* -+ * Setup Invalidation Queue Address (IQA) register with the address of the -+ * pages we just allocated. The QS field at bits[2:0] indicates the size -+ * (page order) of the queue. -+ * -+ * Queued Head (IQH) and Queue Tail (IQT) registers are automatically -+ * reset to 0 with write to IQA register. 
- */ - dmar_writeq(iommu->reg, DMAR_IQA_REG, -- iommu->qinval_maddr | QINVAL_PAGE_ORDER); -+ iommu->qinval_maddr | qi_pg_order); - - dmar_writeq(iommu->reg, DMAR_IQT_REG, 0); - diff --git a/xsa373-4.15-2.patch b/xsa373-4.15-2.patch deleted file mode 100644 index d61a3b4..0000000 --- a/xsa373-4.15-2.patch +++ /dev/null @@ -1,102 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: size command buffer dynamically - -With the present synchronous model, we need two slots for every -operation (the operation itself and a wait command). There can be one -such pair of commands pending per CPU. To ensure that under all normal -circumstances a slot is always available when one is requested, size the -command ring according to the number of present CPUs. - -This is part of XSA-373 / CVE-2021-28692. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu-defs.h -+++ b/xen/drivers/passthrough/amd/iommu-defs.h -@@ -20,9 +20,6 @@ - #ifndef AMD_IOMMU_DEFS_H - #define AMD_IOMMU_DEFS_H - --/* IOMMU Command Buffer entries: in power of 2 increments, minimum of 256 */ --#define IOMMU_CMD_BUFFER_DEFAULT_ENTRIES 512 -- - /* IOMMU Event Log entries: in power of 2 increments, minimum of 256 */ - #define IOMMU_EVENT_LOG_DEFAULT_ENTRIES 512 - -@@ -164,8 +161,8 @@ struct amd_iommu_dte { - #define IOMMU_CMD_BUFFER_LENGTH_MASK 0x0F000000 - #define IOMMU_CMD_BUFFER_LENGTH_SHIFT 24 - --#define IOMMU_CMD_BUFFER_ENTRY_SIZE 16 --#define IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE 8 -+#define IOMMU_CMD_BUFFER_ENTRY_ORDER 4 -+#define IOMMU_CMD_BUFFER_MAX_ENTRIES (1u << 15) - - #define IOMMU_CMD_OPCODE_MASK 0xF0000000 - #define IOMMU_CMD_OPCODE_SHIFT 28 ---- a/xen/drivers/passthrough/amd/iommu_cmd.c -+++ b/xen/drivers/passthrough/amd/iommu_cmd.c -@@ -24,7 +24,7 @@ static int queue_iommu_command(struct am - { - uint32_t tail, head; - -- tail = iommu->cmd_buffer.tail + IOMMU_CMD_BUFFER_ENTRY_SIZE; -+ tail = iommu->cmd_buffer.tail + sizeof(cmd_entry_t); - if ( tail == iommu->cmd_buffer.size ) - tail = 0; - -@@ -33,7 +33,7 @@ static int queue_iommu_command(struct am - if ( head != tail ) - { - memcpy(iommu->cmd_buffer.buffer + iommu->cmd_buffer.tail, -- cmd, IOMMU_CMD_BUFFER_ENTRY_SIZE); -+ cmd, sizeof(cmd_entry_t)); - - iommu->cmd_buffer.tail = tail; - return 1; ---- a/xen/drivers/passthrough/amd/iommu_init.c -+++ b/xen/drivers/passthrough/amd/iommu_init.c -@@ -118,7 +118,7 @@ static void register_iommu_cmd_buffer_in - writel(entry, iommu->mmio_base + IOMMU_CMD_BUFFER_BASE_LOW_OFFSET); - - power_of2_entries = get_order_from_bytes(iommu->cmd_buffer.size) + -- IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE; -+ PAGE_SHIFT - IOMMU_CMD_BUFFER_ENTRY_ORDER; - - entry = 0; - iommu_set_addr_hi_to_reg(&entry, addr_hi); -@@ -1018,9 +1018,31 @@ static void *__init allocate_ring_buffer - static void * __init allocate_cmd_buffer(struct amd_iommu *iommu) - { - /* allocate 'command buffer' in power of 2 increments of 4K */ -+ static unsigned int __read_mostly nr_ents; -+ -+ if ( !nr_ents ) -+ { -+ unsigned int order; -+ -+ /* -+ * With the present synchronous model, we need two slots for every -+ * operation (the operation itself and a wait command). There can be -+ * one such pair of requests pending per CPU. One extra entry is -+ * needed as the ring is considered full when there's only one entry -+ * left. 
-+ */ -+ BUILD_BUG_ON(CONFIG_NR_CPUS * 2 >= IOMMU_CMD_BUFFER_MAX_ENTRIES); -+ order = get_order_from_bytes((num_present_cpus() * 2 + 1) << -+ IOMMU_CMD_BUFFER_ENTRY_ORDER); -+ nr_ents = 1u << (order + PAGE_SHIFT - IOMMU_CMD_BUFFER_ENTRY_ORDER); -+ -+ AMD_IOMMU_DEBUG("using %u-entry cmd ring(s)\n", nr_ents); -+ } -+ -+ BUILD_BUG_ON(sizeof(cmd_entry_t) != (1u << IOMMU_CMD_BUFFER_ENTRY_ORDER)); -+ - return allocate_ring_buffer(&iommu->cmd_buffer, sizeof(cmd_entry_t), -- IOMMU_CMD_BUFFER_DEFAULT_ENTRIES, -- "Command Buffer", false); -+ nr_ents, "Command Buffer", false); - } - - static void * __init allocate_event_log(struct amd_iommu *iommu) diff --git a/xsa373-4.15-3.patch b/xsa373-4.15-3.patch deleted file mode 100644 index c7ddf5d..0000000 --- a/xsa373-4.15-3.patch +++ /dev/null @@ -1,163 +0,0 @@ -From: Jan Beulich -Subject: VT-d: eliminate flush related timeouts - -Leaving an in-progress operation pending when it appears to take too -long is problematic: If e.g. a QI command completed later, the write to -the "poll slot" may instead be understood to signal a subsequently -started command's completion. Also our accounting of the timeout period -was actually wrong: We included the time it took for the command to -actually make it to the front of the queue, which could be heavily -affected by guests other than the one for which the flush is being -performed. - -Do away with all timeout detection on all flush related code paths. -Log excessively long processing times (with a progressive threshold) to -have some indication of problems in this area. - -Additionally log (once) if qinval_next_index() didn't immediately find -an available slot. Together with the earlier change sizing the queue(s) -dynamically, we should now have a guarantee that with our fully -synchronous model any demand for slots can actually be satisfied. - -This is part of XSA-373 / CVE-2021-28692. 
- -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/vtd/dmar.h -+++ b/xen/drivers/passthrough/vtd/dmar.h -@@ -127,6 +127,34 @@ do { - } \ - } while (0) - -+#define IOMMU_FLUSH_WAIT(what, iommu, offset, op, cond, sts) \ -+do { \ -+ static unsigned int __read_mostly threshold = 1; \ -+ s_time_t start = NOW(); \ -+ s_time_t timeout = start + DMAR_OPERATION_TIMEOUT * threshold; \ -+ \ -+ for ( ; ; ) \ -+ { \ -+ sts = op(iommu->reg, offset); \ -+ if ( cond ) \ -+ break; \ -+ if ( timeout && NOW() > timeout ) \ -+ { \ -+ threshold |= threshold << 1; \ -+ printk(XENLOG_WARNING VTDPREFIX \ -+ " IOMMU#%u: %s flush taking too long\n", \ -+ iommu->index, what); \ -+ timeout = 0; \ -+ } \ -+ cpu_relax(); \ -+ } \ -+ \ -+ if ( !timeout ) \ -+ printk(XENLOG_WARNING VTDPREFIX \ -+ " IOMMU#%u: %s flush took %lums\n", \ -+ iommu->index, what, (NOW() - start) / 10000000); \ -+} while ( false ) -+ - int vtd_hw_check(void); - void disable_pmr(struct vtd_iommu *iommu); - int is_igd_drhd(struct acpi_drhd_unit *drhd); ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -373,8 +373,8 @@ static void iommu_flush_write_buffer(str - dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF); - - /* Make sure hardware complete it */ -- IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, -- !(val & DMA_GSTS_WBFS), val); -+ IOMMU_FLUSH_WAIT("write buffer", iommu, DMAR_GSTS_REG, dmar_readl, -+ !(val & DMA_GSTS_WBFS), val); - - spin_unlock_irqrestore(&iommu->register_lock, flags); - } -@@ -423,8 +423,8 @@ int vtd_flush_context_reg(struct vtd_iom - dmar_writeq(iommu->reg, DMAR_CCMD_REG, val); - - /* Make sure hardware complete it */ -- IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq, -- !(val & DMA_CCMD_ICC), val); -+ IOMMU_FLUSH_WAIT("context", iommu, DMAR_CCMD_REG, dmar_readq, -+ !(val & DMA_CCMD_ICC), val); - - spin_unlock_irqrestore(&iommu->register_lock, flags); - /* flush context entry will implicitly flush write buffer */ -@@ -501,8 +501,8 @@ int vtd_flush_iotlb_reg(struct vtd_iommu - dmar_writeq(iommu->reg, tlb_offset + 8, val); - - /* Make sure hardware complete it */ -- IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq, -- !(val & DMA_TLB_IVT), val); -+ IOMMU_FLUSH_WAIT("iotlb", iommu, (tlb_offset + 8), dmar_readq, -+ !(val & DMA_TLB_IVT), val); - spin_unlock_irqrestore(&iommu->register_lock, flags); - - /* check IOTLB invalidation granularity */ ---- a/xen/drivers/passthrough/vtd/qinval.c -+++ b/xen/drivers/passthrough/vtd/qinval.c -@@ -29,8 +29,6 @@ - #include "extern.h" - #include "../ats.h" - --#define VTD_QI_TIMEOUT 1 -- - static unsigned int __read_mostly qi_pg_order; - static unsigned int __read_mostly qi_entry_nr; - -@@ -60,7 +58,11 @@ static unsigned int qinval_next_index(st - /* (tail+1 == head) indicates a full queue, wait for HW */ - while ( ((tail + 1) & (qi_entry_nr - 1)) == - ( dmar_readq(iommu->reg, DMAR_IQH_REG) >> QINVAL_INDEX_SHIFT ) ) -+ { -+ printk_once(XENLOG_ERR VTDPREFIX " IOMMU#%u: no QI slot available\n", -+ iommu->index); - cpu_relax(); -+ } - - return tail; - } -@@ -180,23 +182,32 @@ static int __must_check queue_invalidate - /* Now we don't support interrupt method */ - if ( sw ) - { -- s_time_t timeout; -- -- /* In case all wait descriptor writes to same addr with same data */ -- timeout = NOW() + MILLISECS(flush_dev_iotlb ? -- iommu_dev_iotlb_timeout : VTD_QI_TIMEOUT); -+ static unsigned int __read_mostly threshold = 1; -+ s_time_t start = NOW(); -+ s_time_t timeout = start + (flush_dev_iotlb -+ ? 
iommu_dev_iotlb_timeout -+ : 100) * MILLISECS(threshold); - - while ( ACCESS_ONCE(*this_poll_slot) != QINVAL_STAT_DONE ) - { -- if ( NOW() > timeout ) -+ if ( timeout && NOW() > timeout ) - { -- print_qi_regs(iommu); -+ threshold |= threshold << 1; - printk(XENLOG_WARNING VTDPREFIX -- " Queue invalidate wait descriptor timed out\n"); -- return -ETIMEDOUT; -+ " IOMMU#%u: QI%s wait descriptor taking too long\n", -+ iommu->index, flush_dev_iotlb ? " dev" : ""); -+ print_qi_regs(iommu); -+ timeout = 0; - } - cpu_relax(); - } -+ -+ if ( !timeout ) -+ printk(XENLOG_WARNING VTDPREFIX -+ " IOMMU#%u: QI%s wait descriptor took %lums\n", -+ iommu->index, flush_dev_iotlb ? " dev" : "", -+ (NOW() - start) / 10000000); -+ - return 0; - } - diff --git a/xsa373-4.15-4.patch b/xsa373-4.15-4.patch deleted file mode 100644 index 17592cb..0000000 --- a/xsa373-4.15-4.patch +++ /dev/null @@ -1,79 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: wait for command slot to be available - -No caller cared about send_iommu_command() indicating unavailability of -a slot. Hence if a sufficient number prior commands timed out, we did -blindly assume that the requested command was submitted to the IOMMU -when really it wasn't. This could mean both a hanging system (waiting -for a command to complete that was never seen by the IOMMU) or blindly -propagating success back to callers, making them believe they're fine -to e.g. free previously unmapped pages. - -Fold the three involved functions into one, add spin waiting for an -available slot along the lines of VT-d's qinval_next_index(), and as a -consequence drop all error indicator return types/values. - -This is part of XSA-373 / CVE-2021-28692. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu_cmd.c -+++ b/xen/drivers/passthrough/amd/iommu_cmd.c -@@ -20,43 +20,30 @@ - #include "iommu.h" - #include "../ats.h" - --static int queue_iommu_command(struct amd_iommu *iommu, u32 cmd[]) -+static void send_iommu_command(struct amd_iommu *iommu, -+ const uint32_t cmd[4]) - { -- uint32_t tail, head; -+ uint32_t tail; - - tail = iommu->cmd_buffer.tail + sizeof(cmd_entry_t); - if ( tail == iommu->cmd_buffer.size ) - tail = 0; - -- head = readl(iommu->mmio_base + -- IOMMU_CMD_BUFFER_HEAD_OFFSET) & IOMMU_RING_BUFFER_PTR_MASK; -- if ( head != tail ) -+ while ( tail == (readl(iommu->mmio_base + -+ IOMMU_CMD_BUFFER_HEAD_OFFSET) & -+ IOMMU_RING_BUFFER_PTR_MASK) ) - { -- memcpy(iommu->cmd_buffer.buffer + iommu->cmd_buffer.tail, -- cmd, sizeof(cmd_entry_t)); -- -- iommu->cmd_buffer.tail = tail; -- return 1; -+ printk_once(XENLOG_ERR "AMD IOMMU %pp: no cmd slot available\n", -+ &PCI_SBDF2(iommu->seg, iommu->bdf)); -+ cpu_relax(); - } - -- return 0; --} -- --static void commit_iommu_command_buffer(struct amd_iommu *iommu) --{ -- writel(iommu->cmd_buffer.tail, -- iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET); --} -+ memcpy(iommu->cmd_buffer.buffer + iommu->cmd_buffer.tail, -+ cmd, sizeof(cmd_entry_t)); - --static int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]) --{ -- if ( queue_iommu_command(iommu, cmd) ) -- { -- commit_iommu_command_buffer(iommu); -- return 1; -- } -+ iommu->cmd_buffer.tail = tail; - -- return 0; -+ writel(tail, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET); - } - - static void flush_command_buffer(struct amd_iommu *iommu) diff --git a/xsa373-4.15-5.patch b/xsa373-4.15-5.patch deleted file mode 100644 index 0c6b1ea..0000000 --- a/xsa373-4.15-5.patch +++ /dev/null @@ -1,141 +0,0 @@ -From: Jan Beulich -Subject: 
AMD/IOMMU: drop command completion timeout - -First and foremost - such timeouts were not signaled to callers, making -them believe they're fine to e.g. free previously unmapped pages. - -Mirror VT-d's behavior: A fixed number of loop iterations is not a -suitable way to detect timeouts in an environment (CPU and bus speeds) -independent manner anyway. Furthermore, leaving an in-progress operation -pending when it appears to take too long is problematic: If a command -completed later, the signaling of its completion may instead be -understood to signal a subsequently started command's completion. - -Log excessively long processing times (with a progressive threshold) to -have some indication of problems in this area. Allow callers to specify -a non-default timeout bias for this logging, using the same values as -VT-d does, which in particular means a (by default) much larger value -for device IO TLB invalidation. - -This is part of XSA-373 / CVE-2021-28692. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu_cmd.c -+++ b/xen/drivers/passthrough/amd/iommu_cmd.c -@@ -46,10 +46,12 @@ static void send_iommu_command(struct am - writel(tail, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET); - } - --static void flush_command_buffer(struct amd_iommu *iommu) -+static void flush_command_buffer(struct amd_iommu *iommu, -+ unsigned int timeout_base) - { -- unsigned int cmd[4], status, loop_count; -- bool comp_wait; -+ uint32_t cmd[4]; -+ s_time_t start, timeout; -+ static unsigned int __read_mostly threshold = 1; - - /* RW1C 'ComWaitInt' in status register */ - writel(IOMMU_STATUS_COMP_WAIT_INT, -@@ -65,22 +67,29 @@ static void flush_command_buffer(struct - IOMMU_COMP_WAIT_I_FLAG_SHIFT, &cmd[0]); - send_iommu_command(iommu, cmd); - -- /* Make loop_count long enough for polling completion wait bit */ -- loop_count = 1000; -- do { -- status = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); -- comp_wait = status & IOMMU_STATUS_COMP_WAIT_INT; -- --loop_count; -- } while ( !comp_wait && loop_count ); -- -- if ( comp_wait ) -+ start = NOW(); -+ timeout = start + (timeout_base ?: 100) * MILLISECS(threshold); -+ while ( !(readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET) & -+ IOMMU_STATUS_COMP_WAIT_INT) ) - { -- /* RW1C 'ComWaitInt' in status register */ -- writel(IOMMU_STATUS_COMP_WAIT_INT, -- iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); -- return; -+ if ( timeout && NOW() > timeout ) -+ { -+ threshold |= threshold << 1; -+ printk(XENLOG_WARNING -+ "AMD IOMMU %pp: %scompletion wait taking too long\n", -+ &PCI_SBDF2(iommu->seg, iommu->bdf), -+ timeout_base ? "iotlb " : ""); -+ timeout = 0; -+ } -+ cpu_relax(); - } -- AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n"); -+ -+ if ( !timeout ) -+ printk(XENLOG_WARNING -+ "AMD IOMMU %pp: %scompletion wait took %lums\n", -+ &PCI_SBDF2(iommu->seg, iommu->bdf), -+ timeout_base ? 
"iotlb " : "", -+ (NOW() - start) / 10000000); - } - - /* Build low level iommu command messages */ -@@ -291,7 +300,7 @@ void amd_iommu_flush_iotlb(u8 devfn, con - /* send INVALIDATE_IOTLB_PAGES command */ - spin_lock_irqsave(&iommu->lock, flags); - invalidate_iotlb_pages(iommu, maxpend, 0, queueid, daddr, req_id, order); -- flush_command_buffer(iommu); -+ flush_command_buffer(iommu, iommu_dev_iotlb_timeout); - spin_unlock_irqrestore(&iommu->lock, flags); - } - -@@ -328,7 +337,7 @@ static void _amd_iommu_flush_pages(struc - { - spin_lock_irqsave(&iommu->lock, flags); - invalidate_iommu_pages(iommu, daddr, dom_id, order); -- flush_command_buffer(iommu); -+ flush_command_buffer(iommu, 0); - spin_unlock_irqrestore(&iommu->lock, flags); - } - -@@ -352,7 +361,7 @@ void amd_iommu_flush_device(struct amd_i - ASSERT( spin_is_locked(&iommu->lock) ); - - invalidate_dev_table_entry(iommu, bdf); -- flush_command_buffer(iommu); -+ flush_command_buffer(iommu, 0); - } - - void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf) -@@ -360,7 +369,7 @@ void amd_iommu_flush_intremap(struct amd - ASSERT( spin_is_locked(&iommu->lock) ); - - invalidate_interrupt_table(iommu, bdf); -- flush_command_buffer(iommu); -+ flush_command_buffer(iommu, 0); - } - - void amd_iommu_flush_all_caches(struct amd_iommu *iommu) -@@ -368,7 +377,7 @@ void amd_iommu_flush_all_caches(struct a - ASSERT( spin_is_locked(&iommu->lock) ); - - invalidate_iommu_all(iommu); -- flush_command_buffer(iommu); -+ flush_command_buffer(iommu, 0); - } - - void amd_iommu_send_guest_cmd(struct amd_iommu *iommu, u32 cmd[]) -@@ -378,7 +387,8 @@ void amd_iommu_send_guest_cmd(struct amd - spin_lock_irqsave(&iommu->lock, flags); - - send_iommu_command(iommu, cmd); -- flush_command_buffer(iommu); -+ /* TBD: Timeout selection may require peeking into cmd[]. */ -+ flush_command_buffer(iommu, 0); - - spin_unlock_irqrestore(&iommu->lock, flags); - } diff --git a/xsa375.patch b/xsa375.patch deleted file mode 100644 index aa2e5ad..0000000 --- a/xsa375.patch +++ /dev/null @@ -1,50 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Protect against Speculative Code Store Bypass - -Modern x86 processors have far-better-than-architecturally-guaranteed self -modifying code detection. Typically, when a write hits an instruction in -flight, a Machine Clear occurs to flush stale content in the frontend and -backend. - -For self modifying code, before a write which hits an instruction in flight -retires, the frontend can speculatively decode and execute the old instruction -stream. Speculation of this form can suffer from type confusion in registers, -and potentially leak data. - -Furthermore, updates are typically byte-wise, rather than atomic. Depending -on timing, speculation can race ahead multiple times between individual -writes, and execute the transiently-malformed instruction stream. - -Xen has stubs which are used in certain cases for emulation purposes. Inhibit -speculation between updating the stub and executing it. - -This is XSA-375 / CVE-2021-0089. - -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c -index 8889509d2a..11467a1e3a 100644 ---- a/xen/arch/x86/pv/emul-priv-op.c -+++ b/xen/arch/x86/pv/emul-priv-op.c -@@ -138,6 +138,8 @@ static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode, - /* Runtime confirmation that we haven't clobbered an adjacent stub. 
*/ - BUG_ON(STUB_BUF_SIZE / 2 < (p - ctxt->io_emul_stub)); - -+ block_speculation(); /* SCSB */ -+ - /* Handy function-typed pointer to the stub. */ - return (void *)stub_va; - -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index c25d88d0d8..f42ff2a837 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -1257,6 +1257,7 @@ static inline int mkec(uint8_t e, int32_t ec, ...) - # define invoke_stub(pre, post, constraints...) do { \ - stub_exn.info = (union stub_exception_token) { .raw = ~0 }; \ - stub_exn.line = __LINE__; /* Utility outweighs livepatching cost */ \ -+ block_speculation(); /* SCSB */ \ - asm volatile ( pre "\n\tINDIRECT_CALL %[stub]\n\t" post "\n" \ - ".Lret%=:\n\t" \ - ".pushsection .fixup,\"ax\"\n" \ diff --git a/xsa377.patch b/xsa377.patch deleted file mode 100644 index 1a1887b..0000000 --- a/xsa377.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Mitigate TAA after S3 resume - -The user chosen setting for MSR_TSX_CTRL needs restoring after S3. - -All APs get the correct setting via start_secondary(), but the BSP was missed -out. - -This is XSA-377 / CVE-2021-28690. - -Fixes: 8c4330818f6 ("x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel") -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index 91a8c4d0bd..31a56f02d0 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -288,6 +288,8 @@ static int enter_state(u32 state) - - microcode_update_one(); - -+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ -+ - if ( !recheck_cpu_features(0) ) - panic("Missing previously available feature(s)\n"); - diff --git a/xsa378-4.15-1.patch b/xsa378-4.15-1.patch deleted file mode 100644 index 4b38f96..0000000 --- a/xsa378-4.15-1.patch +++ /dev/null @@ -1,142 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: correct global exclusion range extending - -Besides unity mapping regions, the AMD IOMMU spec also provides for -exclusion ranges (areas of memory not to be subject to DMA translation) -to be specified by firmware in the ACPI tables. The spec does not put -any constraints on the number of such regions. - -Blindly assuming all addresses between any two such ranges should also -be excluded can't be right. Since hardware has room for just a single -such range (comprised of the Exclusion Base Register and the Exclusion -Range Limit Register), combine only adjacent or overlapping regions (for -now; this may require further adjustment in case table entries aren't -sorted by address) with matching exclusion_allow_all settings. This -requires bubbling up error indicators, such that IOMMU init can be -failed when concatenation wasn't possible. - -Furthermore, since the exclusion range specified in IOMMU registers -implies R/W access, reject requests asking for less permissions (this -will be brought closer to the spec by a subsequent change). - -This is part of XSA-378 / CVE-2021-28695. 
- -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu_acpi.c -+++ b/xen/drivers/passthrough/amd/iommu_acpi.c -@@ -116,12 +116,21 @@ static struct amd_iommu * __init find_io - return NULL; - } - --static void __init reserve_iommu_exclusion_range( -- struct amd_iommu *iommu, uint64_t base, uint64_t limit) -+static int __init reserve_iommu_exclusion_range( -+ struct amd_iommu *iommu, uint64_t base, uint64_t limit, -+ bool all, bool iw, bool ir) - { -+ if ( !ir || !iw ) -+ return -EPERM; -+ - /* need to extend exclusion range? */ - if ( iommu->exclusion_enable ) - { -+ if ( iommu->exclusion_limit + PAGE_SIZE < base || -+ limit + PAGE_SIZE < iommu->exclusion_base || -+ iommu->exclusion_allow_all != all ) -+ return -EBUSY; -+ - if ( iommu->exclusion_base < base ) - base = iommu->exclusion_base; - if ( iommu->exclusion_limit > limit ) -@@ -129,16 +138,11 @@ static void __init reserve_iommu_exclusi - } - - iommu->exclusion_enable = IOMMU_CONTROL_ENABLED; -+ iommu->exclusion_allow_all = all; - iommu->exclusion_base = base; - iommu->exclusion_limit = limit; --} - --static void __init reserve_iommu_exclusion_range_all( -- struct amd_iommu *iommu, -- unsigned long base, unsigned long limit) --{ -- reserve_iommu_exclusion_range(iommu, base, limit); -- iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED; -+ return 0; - } - - static void __init reserve_unity_map_for_device( -@@ -176,6 +180,7 @@ static int __init register_exclusion_ran - unsigned long range_top, iommu_top, length; - struct amd_iommu *iommu; - unsigned int bdf; -+ int rc = 0; - - /* is part of exclusion range inside of IOMMU virtual address space? */ - /* note: 'limit' parameter is assumed to be page-aligned */ -@@ -197,10 +202,15 @@ static int __init register_exclusion_ran - if ( limit >= iommu_top ) - { - for_each_amd_iommu( iommu ) -- reserve_iommu_exclusion_range_all(iommu, base, limit); -+ { -+ rc = reserve_iommu_exclusion_range(iommu, base, limit, -+ true /* all */, iw, ir); -+ if ( rc ) -+ break; -+ } - } - -- return 0; -+ return rc; - } - - static int __init register_exclusion_range_for_device( -@@ -211,6 +221,7 @@ static int __init register_exclusion_ran - unsigned long range_top, iommu_top, length; - struct amd_iommu *iommu; - u16 req; -+ int rc = 0; - - iommu = find_iommu_for_device(seg, bdf); - if ( !iommu ) -@@ -240,12 +251,13 @@ static int __init register_exclusion_ran - /* register IOMMU exclusion range settings for device */ - if ( limit >= iommu_top ) - { -- reserve_iommu_exclusion_range(iommu, base, limit); -+ rc = reserve_iommu_exclusion_range(iommu, base, limit, -+ false /* all */, iw, ir); - ivrs_mappings[bdf].dte_allow_exclusion = true; - ivrs_mappings[req].dte_allow_exclusion = true; - } - -- return 0; -+ return rc; - } - - static int __init register_exclusion_range_for_iommu_devices( -@@ -255,6 +267,7 @@ static int __init register_exclusion_ran - unsigned long range_top, iommu_top, length; - unsigned int bdf; - u16 req; -+ int rc = 0; - - /* is part of exclusion range inside of IOMMU virtual address space? 
*/ - /* note: 'limit' parameter is assumed to be page-aligned */ -@@ -285,8 +298,10 @@ static int __init register_exclusion_ran - - /* register IOMMU exclusion range settings */ - if ( limit >= iommu_top ) -- reserve_iommu_exclusion_range_all(iommu, base, limit); -- return 0; -+ rc = reserve_iommu_exclusion_range(iommu, base, limit, -+ true /* all */, iw, ir); -+ -+ return rc; - } - - static int __init parse_ivmd_device_select( diff --git a/xsa378-4.15-2.patch b/xsa378-4.15-2.patch deleted file mode 100644 index aa19772..0000000 --- a/xsa378-4.15-2.patch +++ /dev/null @@ -1,218 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: correct device unity map handling - -Blindly assuming all addresses between any two such ranges, specified by -firmware in the ACPI tables, should also be unity-mapped can't be right. -Nor can it be correct to merge ranges with differing permissions. Track -ranges individually; don't merge at all, but check for overlaps instead. -This requires bubbling up error indicators, such that IOMMU init can be -failed when allocation of a new tracking struct wasn't possible, or an -overlap was detected. - -At this occasion also stop ignoring -amd_iommu_reserve_domain_unity_map()'s return value. - -This is part of XSA-378 / CVE-2021-28695. - -Signed-off-by: Jan Beulich -Reviewed-by: George Dunlap -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -107,20 +107,24 @@ struct amd_iommu { - struct list_head ats_devices; - }; - -+struct ivrs_unity_map { -+ bool read:1; -+ bool write:1; -+ paddr_t addr; -+ unsigned long length; -+ struct ivrs_unity_map *next; -+}; -+ - struct ivrs_mappings { - uint16_t dte_requestor_id; - bool valid:1; - bool dte_allow_exclusion:1; -- bool unity_map_enable:1; -- bool write_permission:1; -- bool read_permission:1; - - /* ivhd device data settings */ - uint8_t device_flags; - -- unsigned long addr_range_start; -- unsigned long addr_range_length; - struct amd_iommu *iommu; -+ struct ivrs_unity_map *unity_map; - - /* per device interrupt remapping table */ - void *intremap_table; ---- a/xen/drivers/passthrough/amd/iommu_acpi.c -+++ b/xen/drivers/passthrough/amd/iommu_acpi.c -@@ -145,32 +145,48 @@ static int __init reserve_iommu_exclusio - return 0; - } - --static void __init reserve_unity_map_for_device( -- u16 seg, u16 bdf, unsigned long base, -- unsigned long length, u8 iw, u8 ir) -+static int __init reserve_unity_map_for_device( -+ uint16_t seg, uint16_t bdf, unsigned long base, -+ unsigned long length, bool iw, bool ir) - { - struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); -- unsigned long old_top, new_top; -+ struct ivrs_unity_map *unity_map = ivrs_mappings[bdf].unity_map; - -- /* need to extend unity-mapped range? */ -- if ( ivrs_mappings[bdf].unity_map_enable ) -+ /* Check for overlaps. */ -+ for ( ; unity_map; unity_map = unity_map->next ) - { -- old_top = ivrs_mappings[bdf].addr_range_start + -- ivrs_mappings[bdf].addr_range_length; -- new_top = base + length; -- if ( old_top > new_top ) -- new_top = old_top; -- if ( ivrs_mappings[bdf].addr_range_start < base ) -- base = ivrs_mappings[bdf].addr_range_start; -- length = new_top - base; -- } -- -- /* extend r/w permissioms and keep aggregate */ -- ivrs_mappings[bdf].write_permission = iw; -- ivrs_mappings[bdf].read_permission = ir; -- ivrs_mappings[bdf].unity_map_enable = true; -- ivrs_mappings[bdf].addr_range_start = base; -- ivrs_mappings[bdf].addr_range_length = length; -+ /* -+ * Exact matches are okay. 
This can in particular happen when -+ * register_exclusion_range_for_device() calls here twice for the -+ * same (s,b,d,f). -+ */ -+ if ( base == unity_map->addr && length == unity_map->length && -+ ir == unity_map->read && iw == unity_map->write ) -+ return 0; -+ -+ if ( unity_map->addr + unity_map->length > base && -+ base + length > unity_map->addr ) -+ { -+ AMD_IOMMU_DEBUG("IVMD Error: overlap [%lx,%lx) vs [%lx,%lx)\n", -+ base, base + length, unity_map->addr, -+ unity_map->addr + unity_map->length); -+ return -EPERM; -+ } -+ } -+ -+ /* Populate and insert a new unity map. */ -+ unity_map = xmalloc(struct ivrs_unity_map); -+ if ( !unity_map ) -+ return -ENOMEM; -+ -+ unity_map->read = ir; -+ unity_map->write = iw; -+ unity_map->addr = base; -+ unity_map->length = length; -+ unity_map->next = ivrs_mappings[bdf].unity_map; -+ ivrs_mappings[bdf].unity_map = unity_map; -+ -+ return 0; - } - - static int __init register_exclusion_range_for_all_devices( -@@ -193,13 +209,13 @@ static int __init register_exclusion_ran - length = range_top - base; - /* reserve r/w unity-mapped page entries for devices */ - /* note: these entries are part of the exclusion range */ -- for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) -- reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); -+ for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) -+ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); - /* push 'base' just outside of virtual address space */ - base = iommu_top; - } - /* register IOMMU exclusion range settings */ -- if ( limit >= iommu_top ) -+ if ( !rc && limit >= iommu_top ) - { - for_each_amd_iommu( iommu ) - { -@@ -241,15 +257,15 @@ static int __init register_exclusion_ran - length = range_top - base; - /* reserve unity-mapped page entries for device */ - /* note: these entries are part of the exclusion range */ -- reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); -- reserve_unity_map_for_device(seg, req, base, length, iw, ir); -+ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir) ?: -+ reserve_unity_map_for_device(seg, req, base, length, iw, ir); - - /* push 'base' just outside of virtual address space */ - base = iommu_top; - } - - /* register IOMMU exclusion range settings for device */ -- if ( limit >= iommu_top ) -+ if ( !rc && limit >= iommu_top ) - { - rc = reserve_iommu_exclusion_range(iommu, base, limit, - false /* all */, iw, ir); -@@ -280,15 +296,15 @@ static int __init register_exclusion_ran - length = range_top - base; - /* reserve r/w unity-mapped page entries for devices */ - /* note: these entries are part of the exclusion range */ -- for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) -+ for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) - { - if ( iommu == find_iommu_for_device(iommu->seg, bdf) ) - { -- reserve_unity_map_for_device(iommu->seg, bdf, base, length, -- iw, ir); - req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; -- reserve_unity_map_for_device(iommu->seg, req, base, length, -- iw, ir); -+ rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, -+ iw, ir) ?: -+ reserve_unity_map_for_device(iommu->seg, req, base, length, -+ iw, ir); - } - } - -@@ -297,7 +313,7 @@ static int __init register_exclusion_ran - } - - /* register IOMMU exclusion range settings */ -- if ( limit >= iommu_top ) -+ if ( !rc && limit >= iommu_top ) - rc = reserve_iommu_exclusion_range(iommu, base, limit, - true /* all */, iw, ir); - ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c 
-@@ -367,15 +367,17 @@ static int amd_iommu_assign_device(struc - struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); - int bdf = PCI_BDF2(pdev->bus, devfn); - int req_id = get_dma_requestor_id(pdev->seg, bdf); -+ const struct ivrs_unity_map *unity_map; - -- if ( ivrs_mappings[req_id].unity_map_enable ) -+ for ( unity_map = ivrs_mappings[req_id].unity_map; unity_map; -+ unity_map = unity_map->next ) - { -- amd_iommu_reserve_domain_unity_map( -- d, -- ivrs_mappings[req_id].addr_range_start, -- ivrs_mappings[req_id].addr_range_length, -- ivrs_mappings[req_id].write_permission, -- ivrs_mappings[req_id].read_permission); -+ int rc = amd_iommu_reserve_domain_unity_map( -+ d, unity_map->addr, unity_map->length, -+ unity_map->write, unity_map->read); -+ -+ if ( rc ) -+ return rc; - } - - return reassign_device(pdev->domain, d, devfn, pdev); diff --git a/xsa378-4.15-3.patch b/xsa378-4.15-3.patch deleted file mode 100644 index 9e1bd18..0000000 --- a/xsa378-4.15-3.patch +++ /dev/null @@ -1,102 +0,0 @@ -From: Jan Beulich -Subject: IOMMU: also pass p2m_access_t to p2m_get_iommu_flags() - -A subsequent change will want to customize the IOMMU permissions based -on this. - -This is part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/arch/x86/mm/p2m-ept.c -+++ b/xen/arch/x86/mm/p2m-ept.c -@@ -681,7 +681,7 @@ ept_set_entry(struct p2m_domain *p2m, gf - uint8_t ipat = 0; - bool_t need_modify_vtd_table = 1; - bool_t vtd_pte_present = 0; -- unsigned int iommu_flags = p2m_get_iommu_flags(p2mt, mfn); -+ unsigned int iommu_flags = p2m_get_iommu_flags(p2mt, p2ma, mfn); - bool_t needs_sync = 1; - ept_entry_t old_entry = { .epte = 0 }; - ept_entry_t new_entry = { .epte = 0 }; -@@ -809,8 +809,8 @@ ept_set_entry(struct p2m_domain *p2m, gf - - /* Safe to read-then-write because we hold the p2m lock */ - if ( ept_entry->mfn == new_entry.mfn && -- p2m_get_iommu_flags(ept_entry->sa_p2mt, _mfn(ept_entry->mfn)) == -- iommu_flags ) -+ p2m_get_iommu_flags(ept_entry->sa_p2mt, ept_entry->access, -+ _mfn(ept_entry->mfn)) == iommu_flags ) - need_modify_vtd_table = 0; - - ept_p2m_type_to_flags(p2m, &new_entry); ---- a/xen/arch/x86/mm/p2m-pt.c -+++ b/xen/arch/x86/mm/p2m-pt.c -@@ -545,6 +545,16 @@ int p2m_pt_handle_deferred_changes(uint6 - return rc; - } - -+/* Reconstruct a fake p2m_access_t from stored PTE flags. */ -+static p2m_access_t p2m_flags_to_access(unsigned int flags) -+{ -+ if ( flags & _PAGE_PRESENT ) -+ return p2m_access_n; -+ -+ /* No need to look at _PAGE_NX for now. */ -+ return flags & _PAGE_RW ? p2m_access_rw : p2m_access_r; -+} -+ - /* Checks only applicable to entries with order > PAGE_ORDER_4K */ - static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old, - unsigned int order) -@@ -579,7 +589,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, - l2_pgentry_t l2e_content; - l3_pgentry_t l3e_content; - int rc; -- unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt, mfn); -+ unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt, p2ma, mfn); - /* - * old_mfn and iommu_old_flags control possible flush/update needs on the - * IOMMU: We need to flush when MFN or flags (i.e. permissions) change. 
-@@ -642,6 +652,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, - old_mfn = l1e_get_pfn(*p2m_entry); - iommu_old_flags = - p2m_get_iommu_flags(p2m_flags_to_type(flags), -+ p2m_flags_to_access(flags), - _mfn(old_mfn)); - } - else -@@ -684,9 +695,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m, - 0, L1_PAGETABLE_ENTRIES); - ASSERT(p2m_entry); - old_mfn = l1e_get_pfn(*p2m_entry); -+ flags = l1e_get_flags(*p2m_entry); - iommu_old_flags = -- p2m_get_iommu_flags(p2m_flags_to_type(l1e_get_flags(*p2m_entry)), -- _mfn(old_mfn)); -+ p2m_get_iommu_flags(p2m_flags_to_type(flags), -+ p2m_flags_to_access(flags), _mfn(old_mfn)); - - if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) - entry_content = p2m_l1e_from_pfn(mfn_x(mfn), -@@ -714,6 +726,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, - old_mfn = l1e_get_pfn(*p2m_entry); - iommu_old_flags = - p2m_get_iommu_flags(p2m_flags_to_type(flags), -+ p2m_flags_to_access(flags), - _mfn(old_mfn)); - } - else ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -915,7 +915,8 @@ static inline void p2m_altp2m_check(stru - /* - * p2m type to IOMMU flags - */ --static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt, mfn_t mfn) -+static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt, -+ p2m_access_t p2ma, mfn_t mfn) - { - unsigned int flags; - diff --git a/xsa378-4.15-4.patch b/xsa378-4.15-4.patch deleted file mode 100644 index a1d86db..0000000 --- a/xsa378-4.15-4.patch +++ /dev/null @@ -1,399 +0,0 @@ -From: Jan Beulich -Subject: IOMMU: generalize VT-d's tracking of mapped RMRR regions - -In order to re-use it elsewhere, move the logic to vendor independent -code and strip it of RMRR specifics. - -Note that the prior "map" parameter gets folded into the new "p2ma" one -(which AMD IOMMU code will want to make use of), assigning alternative -meaning ("unmap") to p2m_access_x. Prepare set_identity_p2m_entry() and -p2m_get_iommu_flags() for getting passed access types other than -p2m_access_rw (in the latter case just for p2m_mmio_direct requests). - -Note also that, to be on the safe side, an overlap check gets added to -the main loop of iommu_identity_mapping(). - -This is part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -1365,7 +1365,7 @@ int set_identity_p2m_entry(struct domain - return 0; - return iommu_legacy_map(d, _dfn(gfn_l), _mfn(gfn_l), - 1ul << PAGE_ORDER_4K, -- IOMMUF_readable | IOMMUF_writable); -+ p2m_access_to_iommu_flags(p2ma)); - } - - gfn_lock(p2m, gfn, 0); ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -42,12 +42,6 @@ - #include "vtd.h" - #include "../ats.h" - --struct mapped_rmrr { -- struct list_head list; -- u64 base, end; -- unsigned int count; --}; -- - /* Possible unfiltered LAPIC/MSI messages from untrusted sources? 
*/ - bool __read_mostly untrusted_msi; - -@@ -1311,7 +1305,6 @@ static int intel_iommu_domain_init(struc - struct domain_iommu *hd = dom_iommu(d); - - hd->arch.vtd.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); -- INIT_LIST_HEAD(&hd->arch.vtd.mapped_rmrrs); - - return 0; - } -@@ -1788,17 +1781,12 @@ static void iommu_clear_root_pgtable(str - static void iommu_domain_teardown(struct domain *d) - { - struct domain_iommu *hd = dom_iommu(d); -- struct mapped_rmrr *mrmrr, *tmp; - const struct acpi_drhd_unit *drhd; - - if ( list_empty(&acpi_drhd_units) ) - return; - -- list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.vtd.mapped_rmrrs, list ) -- { -- list_del(&mrmrr->list); -- xfree(mrmrr); -- } -+ iommu_identity_map_teardown(d); - - ASSERT(!hd->arch.vtd.pgd_maddr); - -@@ -1946,74 +1934,6 @@ static int __init vtd_ept_page_compatibl - (ept_has_1gb(ept_cap) && opt_hap_1gb) <= cap_sps_1gb(vtd_cap); - } - --static int rmrr_identity_mapping(struct domain *d, bool_t map, -- const struct acpi_rmrr_unit *rmrr, -- u32 flag) --{ -- unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K; -- unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K; -- struct mapped_rmrr *mrmrr; -- struct domain_iommu *hd = dom_iommu(d); -- -- ASSERT(pcidevs_locked()); -- ASSERT(rmrr->base_address < rmrr->end_address); -- -- /* -- * No need to acquire hd->arch.mapping_lock: Both insertion and removal -- * get done while holding pcidevs_lock. -- */ -- list_for_each_entry( mrmrr, &hd->arch.vtd.mapped_rmrrs, list ) -- { -- if ( mrmrr->base == rmrr->base_address && -- mrmrr->end == rmrr->end_address ) -- { -- int ret = 0; -- -- if ( map ) -- { -- ++mrmrr->count; -- return 0; -- } -- -- if ( --mrmrr->count ) -- return 0; -- -- while ( base_pfn < end_pfn ) -- { -- if ( clear_identity_p2m_entry(d, base_pfn) ) -- ret = -ENXIO; -- base_pfn++; -- } -- -- list_del(&mrmrr->list); -- xfree(mrmrr); -- return ret; -- } -- } -- -- if ( !map ) -- return -ENOENT; -- -- while ( base_pfn < end_pfn ) -- { -- int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag); -- -- if ( err ) -- return err; -- base_pfn++; -- } -- -- mrmrr = xmalloc(struct mapped_rmrr); -- if ( !mrmrr ) -- return -ENOMEM; -- mrmrr->base = rmrr->base_address; -- mrmrr->end = rmrr->end_address; -- mrmrr->count = 1; -- list_add_tail(&mrmrr->list, &hd->arch.vtd.mapped_rmrrs); -- -- return 0; --} -- - static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) - { - struct acpi_rmrr_unit *rmrr; -@@ -2045,7 +1965,9 @@ static int intel_iommu_add_device(u8 dev - * Since RMRRs are always reserved in the e820 map for the hardware - * domain, there shouldn't be a conflict. - */ -- ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0); -+ ret = iommu_identity_mapping(pdev->domain, p2m_access_rw, -+ rmrr->base_address, rmrr->end_address, -+ 0); - if ( ret ) - dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n", - pdev->domain->domain_id); -@@ -2090,7 +2012,8 @@ static int intel_iommu_remove_device(u8 - * Any flag is nothing to clear these mappings but here - * its always safe and strict to set 0. - */ -- rmrr_identity_mapping(pdev->domain, 0, rmrr, 0); -+ iommu_identity_mapping(pdev->domain, p2m_access_x, rmrr->base_address, -+ rmrr->end_address, 0); - } - - return domain_context_unmap(pdev->domain, devfn, pdev); -@@ -2289,7 +2212,8 @@ static void __hwdom_init setup_hwdom_rmr - * domain, there shouldn't be a conflict. So its always safe and - * strict to set 0. 
- */ -- ret = rmrr_identity_mapping(d, 1, rmrr, 0); -+ ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, -+ rmrr->end_address, 0); - if ( ret ) - dprintk(XENLOG_ERR VTDPREFIX, - "IOMMU: mapping reserved region failed\n"); -@@ -2460,7 +2384,9 @@ static int reassign_device_ownership( - * Any RMRR flag is always ignored when remove a device, - * but its always safe and strict to set 0. - */ -- ret = rmrr_identity_mapping(source, 0, rmrr, 0); -+ ret = iommu_identity_mapping(source, p2m_access_x, -+ rmrr->base_address, -+ rmrr->end_address, 0); - if ( ret != -ENOENT ) - return ret; - } -@@ -2556,7 +2482,8 @@ static int intel_iommu_assign_device( - PCI_BUS(bdf) == bus && - PCI_DEVFN2(bdf) == devfn ) - { -- ret = rmrr_identity_mapping(d, 1, rmrr, flag); -+ ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, -+ rmrr->end_address, flag); - if ( ret ) - { - int rc; ---- a/xen/drivers/passthrough/x86/iommu.c -+++ b/xen/drivers/passthrough/x86/iommu.c -@@ -143,6 +143,7 @@ int arch_iommu_domain_init(struct domain - - INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list); - spin_lock_init(&hd->arch.pgtables.lock); -+ INIT_LIST_HEAD(&hd->arch.identity_maps); - - return 0; - } -@@ -158,6 +159,99 @@ void arch_iommu_domain_destroy(struct do - page_list_empty(&dom_iommu(d)->arch.pgtables.list)); - } - -+struct identity_map { -+ struct list_head list; -+ paddr_t base, end; -+ p2m_access_t access; -+ unsigned int count; -+}; -+ -+int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, -+ paddr_t base, paddr_t end, -+ unsigned int flag) -+{ -+ unsigned long base_pfn = base >> PAGE_SHIFT_4K; -+ unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K; -+ struct identity_map *map; -+ struct domain_iommu *hd = dom_iommu(d); -+ -+ ASSERT(pcidevs_locked()); -+ ASSERT(base < end); -+ -+ /* -+ * No need to acquire hd->arch.mapping_lock: Both insertion and removal -+ * get done while holding pcidevs_lock. 
-+ */ -+ list_for_each_entry( map, &hd->arch.identity_maps, list ) -+ { -+ if ( map->base == base && map->end == end ) -+ { -+ int ret = 0; -+ -+ if ( p2ma != p2m_access_x ) -+ { -+ if ( map->access != p2ma ) -+ return -EADDRINUSE; -+ ++map->count; -+ return 0; -+ } -+ -+ if ( --map->count ) -+ return 0; -+ -+ while ( base_pfn < end_pfn ) -+ { -+ if ( clear_identity_p2m_entry(d, base_pfn) ) -+ ret = -ENXIO; -+ base_pfn++; -+ } -+ -+ list_del(&map->list); -+ xfree(map); -+ -+ return ret; -+ } -+ -+ if ( end >= map->base && map->end >= base ) -+ return -EADDRINUSE; -+ } -+ -+ if ( p2ma == p2m_access_x ) -+ return -ENOENT; -+ -+ while ( base_pfn < end_pfn ) -+ { -+ int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); -+ -+ if ( err ) -+ return err; -+ base_pfn++; -+ } -+ -+ map = xmalloc(struct identity_map); -+ if ( !map ) -+ return -ENOMEM; -+ map->base = base; -+ map->end = end; -+ map->access = p2ma; -+ map->count = 1; -+ list_add_tail(&map->list, &hd->arch.identity_maps); -+ -+ return 0; -+} -+ -+void iommu_identity_map_teardown(struct domain *d) -+{ -+ struct domain_iommu *hd = dom_iommu(d); -+ struct identity_map *map, *tmp; -+ -+ list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list ) -+ { -+ list_del(&map->list); -+ xfree(map); -+ } -+} -+ - static bool __hwdom_init hwdom_iommu_map(const struct domain *d, - unsigned long pfn, - unsigned long max_pfn) ---- a/xen/include/asm-x86/iommu.h -+++ b/xen/include/asm-x86/iommu.h -@@ -16,6 +16,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -51,13 +52,14 @@ struct arch_iommu - spinlock_t lock; - } pgtables; - -+ struct list_head identity_maps; -+ - union { - /* Intel VT-d */ - struct { - uint64_t pgd_maddr; /* io page directory machine address */ - unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ - uint64_t iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ -- struct list_head mapped_rmrrs; - } vtd; - /* AMD IOMMU */ - struct { -@@ -123,6 +125,11 @@ static inline void iommu_disable_x2apic( - iommu_ops.disable_x2apic(); - } - -+int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, -+ paddr_t base, paddr_t end, -+ unsigned int flag); -+void iommu_identity_map_teardown(struct domain *d); -+ - extern bool untrusted_msi; - - int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq, ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -912,6 +912,34 @@ struct p2m_domain *p2m_get_altp2m(struct - static inline void p2m_altp2m_check(struct vcpu *v, uint16_t idx) {} - #endif - -+/* p2m access to IOMMU flags */ -+static inline unsigned int p2m_access_to_iommu_flags(p2m_access_t p2ma) -+{ -+ switch ( p2ma ) -+ { -+ case p2m_access_rw: -+ case p2m_access_rwx: -+ return IOMMUF_readable | IOMMUF_writable; -+ -+ case p2m_access_r: -+ case p2m_access_rx: -+ case p2m_access_rx2rw: -+ return IOMMUF_readable; -+ -+ case p2m_access_w: -+ case p2m_access_wx: -+ return IOMMUF_writable; -+ -+ case p2m_access_n: -+ case p2m_access_x: -+ case p2m_access_n2rwx: -+ return 0; -+ } -+ -+ ASSERT_UNREACHABLE(); -+ return 0; -+} -+ - /* - * p2m type to IOMMU flags - */ -@@ -933,9 +961,10 @@ static inline unsigned int p2m_get_iommu - flags = IOMMUF_readable; - break; - case p2m_mmio_direct: -- flags = IOMMUF_readable; -- if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) -- flags |= IOMMUF_writable; -+ flags = p2m_access_to_iommu_flags(p2ma); -+ if ( (flags & IOMMUF_writable) && -+ rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) 
-+ flags &= ~IOMMUF_writable; - break; - default: - flags = 0; diff --git a/xsa378-4.15-5.patch b/xsa378-4.15-5.patch deleted file mode 100644 index 134f425..0000000 --- a/xsa378-4.15-5.patch +++ /dev/null @@ -1,208 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: re-arrange/complete re-assignment handling - -Prior to the assignment step having completed successfully, devices -should not get associated with their new owner. Hand the device to DomIO -(perhaps temporarily), until after the de-assignment step has completed. - -De-assignment of a device (from other than Dom0) as well as failure of -reassign_device() during assignment should result in unity mappings -getting torn down. This in turn requires switching to a refcounted -mapping approach, as was already used by VT-d for its RMRRs, to prevent -unmapping a region used by multiple devices. - -This is CVE-2021-28696 / part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -232,8 +232,10 @@ int __must_check amd_iommu_unmap_page(st - unsigned int *flush_flags); - int __must_check amd_iommu_alloc_root(struct domain *d); - int amd_iommu_reserve_domain_unity_map(struct domain *domain, -- paddr_t phys_addr, unsigned long size, -- int iw, int ir); -+ const struct ivrs_unity_map *map, -+ unsigned int flag); -+int amd_iommu_reserve_domain_unity_unmap(struct domain *d, -+ const struct ivrs_unity_map *map); - int __must_check amd_iommu_flush_iotlb_pages(struct domain *d, dfn_t dfn, - unsigned long page_count, - unsigned int flush_flags); ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -419,38 +419,49 @@ int amd_iommu_flush_iotlb_all(struct dom - return 0; - } - --int amd_iommu_reserve_domain_unity_map(struct domain *domain, -- paddr_t phys_addr, -- unsigned long size, int iw, int ir) -+int amd_iommu_reserve_domain_unity_map(struct domain *d, -+ const struct ivrs_unity_map *map, -+ unsigned int flag) - { -- unsigned long npages, i; -- unsigned long gfn; -- unsigned int flags = !!ir; -- unsigned int flush_flags = 0; -- int rt = 0; -- -- if ( iw ) -- flags |= IOMMUF_writable; -- -- npages = region_to_pages(phys_addr, size); -- gfn = phys_addr >> PAGE_SHIFT; -- for ( i = 0; i < npages; i++ ) -+ int rc; -+ -+ if ( d == dom_io ) -+ return 0; -+ -+ for ( rc = 0; !rc && map; map = map->next ) - { -- unsigned long frame = gfn + i; -+ p2m_access_t p2ma = p2m_access_n; -+ -+ if ( map->read ) -+ p2ma |= p2m_access_r; -+ if ( map->write ) -+ p2ma |= p2m_access_w; - -- rt = amd_iommu_map_page(domain, _dfn(frame), _mfn(frame), flags, -- &flush_flags); -- if ( rt != 0 ) -- break; -+ rc = iommu_identity_mapping(d, p2ma, map->addr, -+ map->addr + map->length - 1, flag); - } - -- /* Use while-break to avoid compiler warning */ -- while ( flush_flags && -- amd_iommu_flush_iotlb_pages(domain, _dfn(gfn), -- npages, flush_flags) ) -- break; -+ return rc; -+} -+ -+int amd_iommu_reserve_domain_unity_unmap(struct domain *d, -+ const struct ivrs_unity_map *map) -+{ -+ int rc; -+ -+ if ( d == dom_io ) -+ return 0; -+ -+ for ( rc = 0; map; map = map->next ) -+ { -+ int ret = iommu_identity_mapping(d, p2m_access_x, map->addr, -+ map->addr + map->length - 1, 0); -+ -+ if ( ret && ret != -ENOENT && !rc ) -+ rc = ret; -+ } - -- return rt; -+ return rc; - } - - int __init amd_iommu_quarantine_init(struct domain *d) ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ 
-329,6 +329,7 @@ static int reassign_device(struct domain - { - struct amd_iommu *iommu; - int bdf, rc; -+ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); - - bdf = PCI_BDF2(pdev->bus, pdev->devfn); - iommu = find_iommu_for_device(pdev->seg, bdf); -@@ -343,10 +344,24 @@ static int reassign_device(struct domain - - amd_iommu_disable_domain_device(source, iommu, devfn, pdev); - -- if ( devfn == pdev->devfn ) -+ /* -+ * If the device belongs to the hardware domain, and it has a unity mapping, -+ * don't remove it from the hardware domain, because BIOS may reference that -+ * mapping. -+ */ -+ if ( !is_hardware_domain(source) ) - { -- list_move(&pdev->domain_list, &target->pdev_list); -- pdev->domain = target; -+ rc = amd_iommu_reserve_domain_unity_unmap( -+ source, -+ ivrs_mappings[get_dma_requestor_id(pdev->seg, bdf)].unity_map); -+ if ( rc ) -+ return rc; -+ } -+ -+ if ( devfn == pdev->devfn && pdev->domain != dom_io ) -+ { -+ list_move(&pdev->domain_list, &dom_io->pdev_list); -+ pdev->domain = dom_io; - } - - rc = allocate_domain_resources(target); -@@ -357,6 +372,12 @@ static int reassign_device(struct domain - AMD_IOMMU_DEBUG("Re-assign %pp from dom%d to dom%d\n", - &pdev->sbdf, source->domain_id, target->domain_id); - -+ if ( devfn == pdev->devfn && pdev->domain != target ) -+ { -+ list_move(&pdev->domain_list, &target->pdev_list); -+ pdev->domain = target; -+ } -+ - return 0; - } - -@@ -367,20 +388,28 @@ static int amd_iommu_assign_device(struc - struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); - int bdf = PCI_BDF2(pdev->bus, devfn); - int req_id = get_dma_requestor_id(pdev->seg, bdf); -- const struct ivrs_unity_map *unity_map; -+ int rc = amd_iommu_reserve_domain_unity_map( -+ d, ivrs_mappings[req_id].unity_map, flag); - -- for ( unity_map = ivrs_mappings[req_id].unity_map; unity_map; -- unity_map = unity_map->next ) -+ if ( !rc ) -+ rc = reassign_device(pdev->domain, d, devfn, pdev); -+ -+ if ( rc && !is_hardware_domain(d) ) - { -- int rc = amd_iommu_reserve_domain_unity_map( -- d, unity_map->addr, unity_map->length, -- unity_map->write, unity_map->read); -+ int ret = amd_iommu_reserve_domain_unity_unmap( -+ d, ivrs_mappings[req_id].unity_map); - -- if ( rc ) -- return rc; -+ if ( ret ) -+ { -+ printk(XENLOG_ERR "AMD-Vi: " -+ "unity-unmap for %pd/%04x:%02x:%02x.%u failed (%d)\n", -+ d, pdev->seg, pdev->bus, -+ PCI_SLOT(devfn), PCI_FUNC(devfn), ret); -+ domain_crash(d); -+ } - } - -- return reassign_device(pdev->domain, d, devfn, pdev); -+ return rc; - } - - static void amd_iommu_clear_root_pgtable(struct domain *d) -@@ -394,6 +423,7 @@ static void amd_iommu_clear_root_pgtable - - static void amd_iommu_domain_destroy(struct domain *d) - { -+ iommu_identity_map_teardown(d); - ASSERT(!dom_iommu(d)->arch.amd.root_table); - } - diff --git a/xsa378-4.15-6.patch b/xsa378-4.15-6.patch deleted file mode 100644 index d6ebc2a..0000000 --- a/xsa378-4.15-6.patch +++ /dev/null @@ -1,411 +0,0 @@ -From: Jan Beulich -Subject: AMD/IOMMU: re-arrange exclusion range and unity map recording - -The spec makes no provisions for OS behavior here to depend on the -amount of RAM found on the system. While the spec may not sufficiently -clearly distinguish both kinds of regions, they are surely meant to be -separate things: Only regions with ACPI_IVMD_EXCLUSION_RANGE set should -be candidates for putting in the exclusion range registers. 
(As there's -only a single such pair of registers per IOMMU, secondary non-adjacent -regions with the flag set already get converted to unity mapped -regions.) - -First of all, drop the dependency on max_page. With commit b4f042236ae0 -("AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables") the -use of it here was stale anyway; it was bogus already before, as it -didn't account for max_page getting increased later on. Simply try an -exclusion range registration first, and if it fails (for being -unsuitable or non-mergeable), register a unity mapping range. - -With this various local variables become unnecessary and hence get -dropped at the same time. - -With the max_page boundary dropped for using unity maps, the minimum -page table tree height now needs both recording and enforcing in -amd_iommu_domain_init(). Since we can't predict which devices may get -assigned to a domain, our only option is to uniformly force at least -that height for all domains, now that the height isn't dynamic anymore. - -Further don't make use of the exclusion range unless ACPI data says so. - -Note that exclusion range registration in -register_range_for_all_devices() is on a best effort basis. Hence unity -map entries also registered are redundant when the former succeeded, but -they also do no harm. Improvements in this area can be done later imo. - -Also adjust types where suitable without touching extra lines. - -This is part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -304,6 +304,8 @@ extern struct hpet_sbdf { - } init; - } hpet_sbdf; - -+extern int amd_iommu_min_paging_mode; -+ - extern void *shared_intremap_table; - extern unsigned long *shared_intremap_inuse; - ---- a/xen/drivers/passthrough/amd/iommu_acpi.c -+++ b/xen/drivers/passthrough/amd/iommu_acpi.c -@@ -117,12 +117,8 @@ static struct amd_iommu * __init find_io - } - - static int __init reserve_iommu_exclusion_range( -- struct amd_iommu *iommu, uint64_t base, uint64_t limit, -- bool all, bool iw, bool ir) -+ struct amd_iommu *iommu, paddr_t base, paddr_t limit, bool all) - { -- if ( !ir || !iw ) -- return -EPERM; -- - /* need to extend exclusion range? */ - if ( iommu->exclusion_enable ) - { -@@ -151,14 +147,18 @@ static int __init reserve_unity_map_for_ - { - struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); - struct ivrs_unity_map *unity_map = ivrs_mappings[bdf].unity_map; -+ int paging_mode = amd_iommu_get_paging_mode(PFN_UP(base + length)); -+ -+ if ( paging_mode < 0 ) -+ return paging_mode; - - /* Check for overlaps. */ - for ( ; unity_map; unity_map = unity_map->next ) - { - /* - * Exact matches are okay. This can in particular happen when -- * register_exclusion_range_for_device() calls here twice for the -- * same (s,b,d,f). -+ * register_range_for_device() calls here twice for the same -+ * (s,b,d,f). 
- */ - if ( base == unity_map->addr && length == unity_map->length && - ir == unity_map->read && iw == unity_map->write ) -@@ -186,55 +186,52 @@ static int __init reserve_unity_map_for_ - unity_map->next = ivrs_mappings[bdf].unity_map; - ivrs_mappings[bdf].unity_map = unity_map; - -+ if ( paging_mode > amd_iommu_min_paging_mode ) -+ amd_iommu_min_paging_mode = paging_mode; -+ - return 0; - } - --static int __init register_exclusion_range_for_all_devices( -- unsigned long base, unsigned long limit, u8 iw, u8 ir) -+static int __init register_range_for_all_devices( -+ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) - { - int seg = 0; /* XXX */ -- unsigned long range_top, iommu_top, length; - struct amd_iommu *iommu; -- unsigned int bdf; - int rc = 0; - - /* is part of exclusion range inside of IOMMU virtual address space? */ - /* note: 'limit' parameter is assumed to be page-aligned */ -- range_top = limit + PAGE_SIZE; -- iommu_top = max_page * PAGE_SIZE; -- if ( base < iommu_top ) -- { -- if ( range_top > iommu_top ) -- range_top = iommu_top; -- length = range_top - base; -- /* reserve r/w unity-mapped page entries for devices */ -- /* note: these entries are part of the exclusion range */ -- for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) -- rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); -- /* push 'base' just outside of virtual address space */ -- base = iommu_top; -- } -- /* register IOMMU exclusion range settings */ -- if ( !rc && limit >= iommu_top ) -+ if ( exclusion ) - { - for_each_amd_iommu( iommu ) - { -- rc = reserve_iommu_exclusion_range(iommu, base, limit, -- true /* all */, iw, ir); -- if ( rc ) -- break; -+ int ret = reserve_iommu_exclusion_range(iommu, base, limit, -+ true /* all */); -+ -+ if ( ret && !rc ) -+ rc = ret; - } - } - -+ if ( !exclusion || rc ) -+ { -+ paddr_t length = limit + PAGE_SIZE - base; -+ unsigned int bdf; -+ -+ /* reserve r/w unity-mapped page entries for devices */ -+ for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) -+ rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); -+ } -+ - return rc; - } - --static int __init register_exclusion_range_for_device( -- u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir) -+static int __init register_range_for_device( -+ unsigned int bdf, paddr_t base, paddr_t limit, -+ bool iw, bool ir, bool exclusion) - { - int seg = 0; /* XXX */ - struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); -- unsigned long range_top, iommu_top, length; - struct amd_iommu *iommu; - u16 req; - int rc = 0; -@@ -248,27 +245,19 @@ static int __init register_exclusion_ran - req = ivrs_mappings[bdf].dte_requestor_id; - - /* note: 'limit' parameter is assumed to be page-aligned */ -- range_top = limit + PAGE_SIZE; -- iommu_top = max_page * PAGE_SIZE; -- if ( base < iommu_top ) -- { -- if ( range_top > iommu_top ) -- range_top = iommu_top; -- length = range_top - base; -+ if ( exclusion ) -+ rc = reserve_iommu_exclusion_range(iommu, base, limit, -+ false /* all */); -+ if ( !exclusion || rc ) -+ { -+ paddr_t length = limit + PAGE_SIZE - base; -+ - /* reserve unity-mapped page entries for device */ -- /* note: these entries are part of the exclusion range */ - rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir) ?: - reserve_unity_map_for_device(seg, req, base, length, iw, ir); -- -- /* push 'base' just outside of virtual address space */ -- base = iommu_top; - } -- -- /* register IOMMU exclusion range settings for device */ -- if ( !rc && limit >= 
iommu_top ) -+ else - { -- rc = reserve_iommu_exclusion_range(iommu, base, limit, -- false /* all */, iw, ir); - ivrs_mappings[bdf].dte_allow_exclusion = true; - ivrs_mappings[req].dte_allow_exclusion = true; - } -@@ -276,53 +265,42 @@ static int __init register_exclusion_ran - return rc; - } - --static int __init register_exclusion_range_for_iommu_devices( -- struct amd_iommu *iommu, -- unsigned long base, unsigned long limit, u8 iw, u8 ir) -+static int __init register_range_for_iommu_devices( -+ struct amd_iommu *iommu, paddr_t base, paddr_t limit, -+ bool iw, bool ir, bool exclusion) - { -- unsigned long range_top, iommu_top, length; -+ /* note: 'limit' parameter is assumed to be page-aligned */ -+ paddr_t length = limit + PAGE_SIZE - base; - unsigned int bdf; - u16 req; -- int rc = 0; -+ int rc; - -- /* is part of exclusion range inside of IOMMU virtual address space? */ -- /* note: 'limit' parameter is assumed to be page-aligned */ -- range_top = limit + PAGE_SIZE; -- iommu_top = max_page * PAGE_SIZE; -- if ( base < iommu_top ) -- { -- if ( range_top > iommu_top ) -- range_top = iommu_top; -- length = range_top - base; -- /* reserve r/w unity-mapped page entries for devices */ -- /* note: these entries are part of the exclusion range */ -- for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) -- { -- if ( iommu == find_iommu_for_device(iommu->seg, bdf) ) -- { -- req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; -- rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, -- iw, ir) ?: -- reserve_unity_map_for_device(iommu->seg, req, base, length, -- iw, ir); -- } -- } -- -- /* push 'base' just outside of virtual address space */ -- base = iommu_top; -+ if ( exclusion ) -+ { -+ rc = reserve_iommu_exclusion_range(iommu, base, limit, true /* all */); -+ if ( !rc ) -+ return 0; - } - -- /* register IOMMU exclusion range settings */ -- if ( !rc && limit >= iommu_top ) -- rc = reserve_iommu_exclusion_range(iommu, base, limit, -- true /* all */, iw, ir); -+ /* reserve unity-mapped page entries for devices */ -+ for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ ) -+ { -+ if ( iommu != find_iommu_for_device(iommu->seg, bdf) ) -+ continue; -+ -+ req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; -+ rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length, -+ iw, ir) ?: -+ reserve_unity_map_for_device(iommu->seg, req, base, length, -+ iw, ir); -+ } - - return rc; - } - - static int __init parse_ivmd_device_select( - const struct acpi_ivrs_memory *ivmd_block, -- unsigned long base, unsigned long limit, u8 iw, u8 ir) -+ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) - { - u16 bdf; - -@@ -333,12 +311,12 @@ static int __init parse_ivmd_device_sele - return -ENODEV; - } - -- return register_exclusion_range_for_device(bdf, base, limit, iw, ir); -+ return register_range_for_device(bdf, base, limit, iw, ir, exclusion); - } - - static int __init parse_ivmd_device_range( - const struct acpi_ivrs_memory *ivmd_block, -- unsigned long base, unsigned long limit, u8 iw, u8 ir) -+ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) - { - unsigned int first_bdf, last_bdf, bdf; - int error; -@@ -360,15 +338,15 @@ static int __init parse_ivmd_device_rang - } - - for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ ) -- error = register_exclusion_range_for_device( -- bdf, base, limit, iw, ir); -+ error = register_range_for_device( -+ bdf, base, limit, iw, ir, exclusion); - - return error; - } - - static int __init 
parse_ivmd_device_iommu( - const struct acpi_ivrs_memory *ivmd_block, -- unsigned long base, unsigned long limit, u8 iw, u8 ir) -+ paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion) - { - int seg = 0; /* XXX */ - struct amd_iommu *iommu; -@@ -383,14 +361,14 @@ static int __init parse_ivmd_device_iomm - return -ENODEV; - } - -- return register_exclusion_range_for_iommu_devices( -- iommu, base, limit, iw, ir); -+ return register_range_for_iommu_devices( -+ iommu, base, limit, iw, ir, exclusion); - } - - static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) - { - unsigned long start_addr, mem_length, base, limit; -- u8 iw, ir; -+ bool iw = true, ir = true, exclusion = false; - - if ( ivmd_block->header.length < sizeof(*ivmd_block) ) - { -@@ -407,13 +385,11 @@ static int __init parse_ivmd_block(const - ivmd_block->header.type, start_addr, mem_length); - - if ( ivmd_block->header.flags & ACPI_IVMD_EXCLUSION_RANGE ) -- iw = ir = IOMMU_CONTROL_ENABLED; -+ exclusion = true; - else if ( ivmd_block->header.flags & ACPI_IVMD_UNITY ) - { -- iw = ivmd_block->header.flags & ACPI_IVMD_READ ? -- IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; -- ir = ivmd_block->header.flags & ACPI_IVMD_WRITE ? -- IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; -+ iw = ivmd_block->header.flags & ACPI_IVMD_READ; -+ ir = ivmd_block->header.flags & ACPI_IVMD_WRITE; - } - else - { -@@ -424,20 +400,20 @@ static int __init parse_ivmd_block(const - switch( ivmd_block->header.type ) - { - case ACPI_IVRS_TYPE_MEMORY_ALL: -- return register_exclusion_range_for_all_devices( -- base, limit, iw, ir); -+ return register_range_for_all_devices( -+ base, limit, iw, ir, exclusion); - - case ACPI_IVRS_TYPE_MEMORY_ONE: -- return parse_ivmd_device_select(ivmd_block, -- base, limit, iw, ir); -+ return parse_ivmd_device_select(ivmd_block, base, limit, -+ iw, ir, exclusion); - - case ACPI_IVRS_TYPE_MEMORY_RANGE: -- return parse_ivmd_device_range(ivmd_block, -- base, limit, iw, ir); -+ return parse_ivmd_device_range(ivmd_block, base, limit, -+ iw, ir, exclusion); - - case ACPI_IVRS_TYPE_MEMORY_IOMMU: -- return parse_ivmd_device_iommu(ivmd_block, -- base, limit, iw, ir); -+ return parse_ivmd_device_iommu(ivmd_block, base, limit, -+ iw, ir, exclusion); - - default: - AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n"); ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -234,6 +234,8 @@ static int __must_check allocate_domain_ - return rc; - } - -+int __read_mostly amd_iommu_min_paging_mode = 1; -+ - static int amd_iommu_domain_init(struct domain *d) - { - struct domain_iommu *hd = dom_iommu(d); -@@ -245,11 +247,13 @@ static int amd_iommu_domain_init(struct - * - HVM could in principle use 3 or 4 depending on how much guest - * physical address space we give it, but this isn't known yet so use 4 - * unilaterally. -+ * - Unity maps may require an even higher number. - */ -- hd->arch.amd.paging_mode = amd_iommu_get_paging_mode( -- is_hvm_domain(d) -- ? 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT) -- : get_upper_mfn_bound() + 1); -+ hd->arch.amd.paging_mode = max(amd_iommu_get_paging_mode( -+ is_hvm_domain(d) -+ ? 
1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT) -+ : get_upper_mfn_bound() + 1), -+ amd_iommu_min_paging_mode); - - return 0; - } diff --git a/xsa378-4.15-7.patch b/xsa378-4.15-7.patch deleted file mode 100644 index 0f59532..0000000 --- a/xsa378-4.15-7.patch +++ /dev/null @@ -1,88 +0,0 @@ -From: Jan Beulich -Subject: x86/p2m: introduce p2m_is_special() - -Seeing the similarity of grant, foreign, and (subsequently) direct-MMIO -handling, introduce a new P2M type group named "special" (as in "needing -special accessors to create/destroy"). - -Also use -EPERM instead of other error codes on the two domain_crash() -paths touched. - -This is part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -811,7 +811,7 @@ p2m_remove_page(struct p2m_domain *p2m, - for ( i = 0; i < (1UL << page_order); i++ ) - { - p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL); -- if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) ) -+ if ( !p2m_is_special(t) && !p2m_is_shared(t) ) - set_gpfn_from_mfn(mfn_x(mfn) + i, INVALID_M2P_ENTRY); - } - } -@@ -941,13 +941,13 @@ guest_physmap_add_entry(struct domain *d - &ot, &a, 0, NULL, NULL); - ASSERT(!p2m_is_shared(ot)); - } -- if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) -+ if ( p2m_is_special(ot) ) - { -- /* Really shouldn't be unmapping grant/foreign maps this way */ -+ /* Don't permit unmapping grant/foreign this way. */ - domain_crash(d); - p2m_unlock(p2m); - -- return -EINVAL; -+ return -EPERM; - } - else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) - { -@@ -1041,8 +1041,7 @@ int p2m_change_type_one(struct domain *d - struct p2m_domain *p2m = p2m_get_hostp2m(d); - int rc; - -- BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); -- BUG_ON(p2m_is_foreign(ot) || p2m_is_foreign(nt)); -+ BUG_ON(p2m_is_special(ot) || p2m_is_special(nt)); - - gfn_lock(p2m, gfn, 0); - -@@ -1289,11 +1288,11 @@ static int set_typed_p2m_entry(struct do - gfn_unlock(p2m, gfn, order); - return cur_order + 1; - } -- if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) -+ if ( p2m_is_special(ot) ) - { - gfn_unlock(p2m, gfn, order); - domain_crash(d); -- return -ENOENT; -+ return -EPERM; - } - else if ( p2m_is_ram(ot) ) - { ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -149,6 +149,10 @@ typedef unsigned int p2m_query_t; - | p2m_to_mask(p2m_ram_logdirty) ) - #define P2M_SHARED_TYPES (p2m_to_mask(p2m_ram_shared)) - -+/* Types established/cleaned up via special accessors. */ -+#define P2M_SPECIAL_TYPES (P2M_GRANT_TYPES | \ -+ p2m_to_mask(p2m_map_foreign)) -+ - /* Valid types not necessarily associated with a (valid) MFN. 
*/ - #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES \ - | p2m_to_mask(p2m_mmio_direct) \ -@@ -177,6 +181,7 @@ typedef unsigned int p2m_query_t; - #define p2m_is_paged(_t) (p2m_to_mask(_t) & P2M_PAGED_TYPES) - #define p2m_is_sharable(_t) (p2m_to_mask(_t) & P2M_SHARABLE_TYPES) - #define p2m_is_shared(_t) (p2m_to_mask(_t) & P2M_SHARED_TYPES) -+#define p2m_is_special(_t) (p2m_to_mask(_t) & P2M_SPECIAL_TYPES) - #define p2m_is_broken(_t) (p2m_to_mask(_t) & P2M_BROKEN_TYPES) - #define p2m_is_foreign(_t) (p2m_to_mask(_t) & p2m_to_mask(p2m_map_foreign)) - diff --git a/xsa378-4.15-8.patch b/xsa378-4.15-8.patch deleted file mode 100644 index 391a0bc..0000000 --- a/xsa378-4.15-8.patch +++ /dev/null @@ -1,155 +0,0 @@ -From: Jan Beulich -Subject: x86/p2m: guard (in particular) identity mapping entries - -Such entries, created by set_identity_p2m_entry(), should only be -destroyed by clear_identity_p2m_entry(). However, similarly, entries -created by set_mmio_p2m_entry() should only be torn down by -clear_mmio_p2m_entry(), so the logic gets based upon p2m_mmio_direct as -the entry type (separation between "ordinary" and 1:1 mappings would -require a further indicator to tell apart the two). - -As to the guest_remove_page() change, commit 48dfb297a20a ("x86/PVH: -allow guest_remove_page to remove p2m_mmio_direct pages"), which -introduced the call to clear_mmio_p2m_entry(), claimed this was done for -hwdom only without this actually having been the case. However, this -code shouldn't be there in the first place, as MMIO entries shouldn't be -dropped this way. Avoid triggering the warning again that 48dfb297a20a -silenced by an adjustment to xenmem_add_to_physmap_one() instead. - -Note that guest_physmap_mark_populate_on_demand() gets tightened beyond -the immediate purpose of this change. - -Note also that I didn't inspect code which isn't security supported, -e.g. sharing, paging, or altp2m. - -This is CVE-2021-28694 / part of XSA-378. - -Signed-off-by: Jan Beulich -Reviewed-by: Paul Durrant - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -799,7 +799,8 @@ p2m_remove_page(struct p2m_domain *p2m, - &cur_order, NULL); - - if ( p2m_is_valid(t) && -- (!mfn_valid(mfn) || !mfn_eq(mfn_add(mfn, i), mfn_return)) ) -+ (!mfn_valid(mfn) || t == p2m_mmio_direct || -+ !mfn_eq(mfn_add(mfn, i), mfn_return)) ) - return -EILSEQ; - - i += (1UL << cur_order) - -@@ -899,7 +900,7 @@ guest_physmap_add_entry(struct domain *d - if ( p2m_is_foreign(t) ) - return -EINVAL; - -- if ( !mfn_valid(mfn) ) -+ if ( !mfn_valid(mfn) || t == p2m_mmio_direct ) - { - ASSERT_UNREACHABLE(); - return -EINVAL; -@@ -943,7 +944,7 @@ guest_physmap_add_entry(struct domain *d - } - if ( p2m_is_special(ot) ) - { -- /* Don't permit unmapping grant/foreign this way. */ -+ /* Don't permit unmapping grant/foreign/direct-MMIO this way. */ - domain_crash(d); - p2m_unlock(p2m); - -@@ -1399,8 +1400,8 @@ int set_identity_p2m_entry(struct domain - * order+1 for caller to retry with order (guaranteed smaller than - * the order value passed in) - */ --int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn_l, mfn_t mfn, -- unsigned int order) -+static int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn_l, -+ mfn_t mfn, unsigned int order) - { - int rc = -EINVAL; - gfn_t gfn = _gfn(gfn_l); -@@ -2731,7 +2732,9 @@ int xenmem_add_to_physmap_one( - - /* Remove previously mapped page if it was present. 
*/ - prev_mfn = get_gfn(d, gfn_x(gpfn), &p2mt); -- if ( mfn_valid(prev_mfn) ) -+ if ( p2mt == p2m_mmio_direct ) -+ rc = -EPERM; -+ else if ( mfn_valid(prev_mfn) ) - { - if ( is_special_page(mfn_to_page(prev_mfn)) ) - /* Special pages are simply unhooked from this phys slot. */ ---- a/xen/arch/x86/mm/p2m-pod.c -+++ b/xen/arch/x86/mm/p2m-pod.c -@@ -1299,17 +1299,17 @@ guest_physmap_mark_populate_on_demand(st - - p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL); - n = 1UL << min(order, cur_order); -- if ( p2m_is_ram(ot) ) -+ if ( ot == p2m_populate_on_demand ) -+ { -+ /* Count how many PoD entries we'll be replacing if successful */ -+ pod_count += n; -+ } -+ else if ( ot != p2m_invalid && ot != p2m_mmio_dm ) - { - P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); - rc = -EBUSY; - goto out; - } -- else if ( ot == p2m_populate_on_demand ) -- { -- /* Count how man PoD entries we'll be replacing if successful */ -- pod_count += n; -- } - } - - /* Now, actually do the two-way mapping */ ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -330,7 +330,7 @@ int guest_remove_page(struct domain *d, - } - if ( p2mt == p2m_mmio_direct ) - { -- rc = clear_mmio_p2m_entry(d, gmfn, mfn, PAGE_ORDER_4K); -+ rc = -EPERM; - goto out_put_gfn; - } - #else -@@ -1875,6 +1875,15 @@ int check_get_page_from_gfn(struct domai - return -EAGAIN; - } - #endif -+#ifdef CONFIG_X86 -+ if ( p2mt == p2m_mmio_direct ) -+ { -+ if ( page ) -+ put_page(page); -+ -+ return -EPERM; -+ } -+#endif - - if ( !page ) - return -EINVAL; ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -151,7 +151,8 @@ typedef unsigned int p2m_query_t; - - /* Types established/cleaned up via special accessors. */ - #define P2M_SPECIAL_TYPES (P2M_GRANT_TYPES | \ -- p2m_to_mask(p2m_map_foreign)) -+ p2m_to_mask(p2m_map_foreign) | \ -+ p2m_to_mask(p2m_mmio_direct)) - - /* Valid types not necessarily associated with a (valid) MFN. */ - #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES \ -@@ -666,8 +667,6 @@ int p2m_is_logdirty_range(struct p2m_dom - /* Set mmio addresses in the p2m table (for pass-through) */ - int set_mmio_p2m_entry(struct domain *d, gfn_t gfn, mfn_t mfn, - unsigned int order); --int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, -- unsigned int order); - - /* Set identity addresses in the p2m table (for pass-through) */ - int set_identity_p2m_entry(struct domain *d, unsigned long gfn, diff --git a/xsa379-4.15.patch b/xsa379-4.15.patch deleted file mode 100644 index 004f56e..0000000 --- a/xsa379-4.15.patch +++ /dev/null @@ -1,82 +0,0 @@ -From: Jan Beulich -Subject: x86/mm: widen locked region in xenmem_add_to_physmap_one() - -For pages which can be made part of the P2M by the guest, but which can -also later be de-allocated (grant table v2 status pages being the -present example), it is imperative that they be mapped at no more than a -single GFN. We therefore need to make sure that of two parallel -XENMAPSPACE_grant_table requests for the same status page one completes -before the second checks at which other GFN the underlying MFN is -presently mapped. - -Push down the respective put_gfn(). This leverages that gfn_lock() -really aliases p2m_lock(), but the function makes this assumption -already anyway: In the XENMAPSPACE_gmfn case lock nesting constraints -for both involved GFNs would otherwise need to be enforced to avoid ABBA -deadlocks. - -This is CVE-2021-28697 / XSA-379. 
- -Signed-off-by: Jan Beulich -Reviewed-by: Julien Grall ---- -Since there was some re-ordering of the checks in staging/master (the --EXDEV now sitting earlier there), I deemed it better to drop the -earlier "if ( rc )" and allow an earlier error to be overwritten by --EXDEV here. - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -2730,8 +2730,20 @@ int xenmem_add_to_physmap_one( - goto put_both; - } - -- /* Remove previously mapped page if it was present. */ -+ /* -+ * Note that we're (ab)using GFN locking (to really be locking of the -+ * entire P2M) here in (at least) two ways: Finer grained locking would -+ * expose lock order violations in the XENMAPSPACE_gmfn case (due to the -+ * earlier get_gfn_unshare() above). Plus at the very least for the grant -+ * table v2 status page case we need to guarantee that the same page can -+ * only appear at a single GFN. While this is a property we want in -+ * general, for pages which can subsequently be freed this imperative: -+ * Upon freeing we wouldn't be able to find other mappings in the P2M -+ * (unless we did a brute force search). -+ */ - prev_mfn = get_gfn(d, gfn_x(gpfn), &p2mt); -+ -+ /* Remove previously mapped page if it was present. */ - if ( p2mt == p2m_mmio_direct ) - rc = -EPERM; - else if ( mfn_valid(prev_mfn) ) -@@ -2743,27 +2755,21 @@ int xenmem_add_to_physmap_one( - /* Normal domain memory is freed, to avoid leaking memory. */ - rc = guest_remove_page(d, gfn_x(gpfn)); - } -- /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */ -- put_gfn(d, gfn_x(gpfn)); -- -- if ( rc ) -- goto put_both; - - /* Unmap from old location, if any. */ - old_gpfn = get_gpfn_from_mfn(mfn_x(mfn)); - ASSERT(!SHARED_M2P(old_gpfn)); - if ( space == XENMAPSPACE_gmfn && old_gpfn != gfn ) -- { - rc = -EXDEV; -- goto put_both; -- } -- if ( old_gpfn != INVALID_M2P_ENTRY ) -+ else if ( !rc && old_gpfn != INVALID_M2P_ENTRY ) - rc = guest_physmap_remove_page(d, _gfn(old_gpfn), mfn, PAGE_ORDER_4K); - - /* Map at new location. */ - if ( !rc ) - rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); - -+ put_gfn(d, gfn_x(gpfn)); -+ - put_both: - /* - * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. diff --git a/xsa380-1.patch b/xsa380-1.patch deleted file mode 100644 index 9212d73..0000000 --- a/xsa380-1.patch +++ /dev/null @@ -1,178 +0,0 @@ -From: Jan Beulich -Subject: gnttab: add preemption check to gnttab_release_mappings() - -A guest may die with many grant mappings still in place, or simply with -a large maptrack table. Iterating through this may take more time than -is reasonable without intermediate preemption (to run softirqs and -perhaps the scheduler). - -Move the invocation of the function to the section where other -restartable functions get invoked, and have the function itself check -for preemption every once in a while. Have it iterate the table -backwards, such that decreasing the maptrack limit is all it takes to -convey restart information. - -In domain_teardown() introduce PROG_none such that inserting at the -front will be easier going forward. - -This is part of CVE-2021-28698 / XSA-380. - -Reported-by: Andrew Cooper -Signed-off-by: Jan Beulich -Reviewed-by: Julien Grall ---- -While I consider removal of the freeing of t->maptrack[i] from -grant_table_destroy() an integral part of this change, also freeing -t->maptrack right in gnttab_release_mappings() would seem like an -unrelated change to me, so I'm not moving that one for now. 
If others -think it would better be moved, I'd be happy to do so. - -While in principle it would be nice to also eliminate the other loops -from grant_table_destroy() (which can all take long as well provided a -large enough max_grant_frames), ->maptrack[] really is special in that -it only gets accessed when processing requests by the domain itself. The -other arrays may all continue to be accessed as remote domains drop uses -of grants by the dying domain. ---- -v3: Add comment. -v2: Move gnttab_release_mappings() invocation into domain_teardown(). - Don't crash when cleaning up domain without maptrack table. Extend - comment next to maptrack_limit. - ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -412,11 +412,18 @@ static int domain_teardown(struct domain - v = d->teardown.vcpu - - enum { -- PROG_vcpu_teardown = 1, -+ PROG_none, -+ PROG_gnttab_mappings, -+ PROG_vcpu_teardown, - PROG_done, - }; - -- case 0: -+ case PROG_none: -+ rc = gnttab_release_mappings(d); -+ if ( rc ) -+ return rc; -+ -+ PROGRESS(gnttab_mappings): - for_each_vcpu ( d, v ) - { - PROGRESS_VCPU(teardown); -@@ -908,7 +915,6 @@ int domain_kill(struct domain *d) - return domain_kill(d); - d->is_dying = DOMDYING_dying; - argo_destroy(d); -- gnttab_release_mappings(d); - vnuma_destroy(d->vnuma); - domain_set_outstanding_pages(d, 0); - /* fallthrough */ ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -64,7 +64,13 @@ struct grant_table { - unsigned int nr_grant_frames; - /* Number of grant status frames shared with guest (for version 2) */ - unsigned int nr_status_frames; -- /* Number of available maptrack entries. */ -+ /* -+ * Number of available maptrack entries. For cleanup purposes it is -+ * important to realize that this field and @maptrack further down will -+ * only ever be accessed by the local domain. Thus it is okay to clean -+ * up early, and to shrink the limit for the purpose of tracking cleanup -+ * progress. -+ */ - unsigned int maptrack_limit; - /* Shared grant table (see include/public/grant_table.h). */ - union { -@@ -3679,9 +3685,7 @@ do_grant_table_op( - #include "compat/grant_table.c" - #endif - --void --gnttab_release_mappings( -- struct domain *d) -+int gnttab_release_mappings(struct domain *d) - { - struct grant_table *gt = d->grant_table, *rgt; - struct grant_mapping *map; -@@ -3695,8 +3699,32 @@ gnttab_release_mappings( - - BUG_ON(!d->is_dying); - -- for ( handle = 0; handle < gt->maptrack_limit; handle++ ) -+ if ( !gt || !gt->maptrack ) -+ return 0; -+ -+ for ( handle = gt->maptrack_limit; handle; ) - { -+ /* -+ * Deal with full pages such that their freeing (in the body of the -+ * if()) remains simple. -+ */ -+ if ( handle < gt->maptrack_limit && !(handle % MAPTRACK_PER_PAGE) ) -+ { -+ /* -+ * Changing maptrack_limit alters nr_maptrack_frames()'es return -+ * value. Free the then excess trailing page right here, rather -+ * than leaving it to grant_table_destroy() (and in turn requiring -+ * to leave gt->maptrack_limit unaltered). 
-+ */ -+ gt->maptrack_limit = handle; -+ FREE_XENHEAP_PAGE(gt->maptrack[nr_maptrack_frames(gt)]); -+ -+ if ( hypercall_preempt_check() ) -+ return -ERESTART; -+ } -+ -+ --handle; -+ - map = &maptrack_entry(gt, handle); - if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ) - continue; -@@ -3780,6 +3808,11 @@ gnttab_release_mappings( - - map->flags = 0; - } -+ -+ gt->maptrack_limit = 0; -+ FREE_XENHEAP_PAGE(gt->maptrack[0]); -+ -+ return 0; - } - - void grant_table_warn_active_grants(struct domain *d) -@@ -3843,8 +3876,7 @@ grant_table_destroy( - free_xenheap_page(t->shared_raw[i]); - xfree(t->shared_raw); - -- for ( i = 0; i < nr_maptrack_frames(t); i++ ) -- free_xenheap_page(t->maptrack[i]); -+ ASSERT(!t->maptrack_limit); - vfree(t->maptrack); - - for ( i = 0; i < nr_active_grant_frames(t); i++ ) ---- a/xen/include/xen/grant_table.h -+++ b/xen/include/xen/grant_table.h -@@ -47,9 +47,7 @@ void grant_table_init_vcpu(struct vcpu * - void grant_table_warn_active_grants(struct domain *d); - - /* Domain death release of granted mappings of other domains' memory. */ --void --gnttab_release_mappings( -- struct domain *d); -+int gnttab_release_mappings(struct domain *d); - - int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref, - gfn_t *gfn, uint16_t *status); -@@ -80,7 +78,7 @@ static inline void grant_table_init_vcpu - - static inline void grant_table_warn_active_grants(struct domain *d) {} - --static inline void gnttab_release_mappings(struct domain *d) {} -+static inline int gnttab_release_mappings(struct domain *d) { return 0; } - - static inline int mem_sharing_gref_to_gfn(struct grant_table *gt, - grant_ref_t ref, diff --git a/xsa380-2.patch b/xsa380-2.patch deleted file mode 100644 index e5a134d..0000000 --- a/xsa380-2.patch +++ /dev/null @@ -1,410 +0,0 @@ -From: Jan Beulich -Subject: gnttab: replace mapkind() - -mapkind() doesn't scale very well with larger maptrack entry counts, -using a brute force linear search through all entries, with the only -option of an early loop exit if a matching writable entry was found. -Introduce a radix tree alongside the main maptrack table, thus -allowing much faster MFN-based lookup. To avoid the need to actually -allocate space for the individual nodes, encode the two counters in the -node pointers themselves, thus limiting the number of permitted -simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) / -2¹⁵-1 (32-bit) each. - -To avoid enforcing an unnecessarily low bound on the number of -simultaneous mappings of a single MFN, introduce -radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling -radix_tree_{int_to_ptr,ptr_to_int}. - -As a consequence locking changes are also applicable: With there no -longer being any inspection of the remote domain's active entries, -there's also no need anymore to hold the remote domain's grant table -lock. And since we're no longer iterating over the local domain's map -track table, the lock in map_grant_ref() can also be dropped before the -new maptrack entry actually gets populated. - -As a nice side effect this also reduces the number of IOMMU operations -in unmap_common(): Previously we would have "established" a readable -mapping whenever we didn't find a writable entry anymore (yet, of -course, at least one readable one). But we only need to do this if we -actually dropped the last writable entry, not if there were none already -before. - -This is part of CVE-2021-28698 / XSA-380. 
- -Signed-off-by: Jan Beulich -Reviewed-by: Julien Grall ---- -I hope that limiting the map count to 32k on Arm32 is good enough. I -also hope it is out of question that 2G of mappings are enough on 64-bit -architectures. - -I'm using the grant table lock for synchronization to limit differences -in behavior to prior code. I think in principle the maptrack lock could -be used equally well. - -Shouldn't IOMMU insertions be limited anyway to GNTMAP_device_map -requests? This would further save on the number of radix tree nodes in -need of maintaining. - -I'm hesitant to introduce GNTST_* in a security patch, but being able to -tell allocation failure or counter overflow from other errors might be -worthwhile. - -I don't think adding anything to gnttab_usage_print() is useful: -radix_tree_gang_lookup() requires nodes to record their own indexes into -the tree, which we don't do to save space. Yet without indexes printing -node contents isn't very useful. Plus there's also no printing of the -main maptrack table contents. ---- -v3: Check for radix_tree_lookup_slot() returning NULL. Convert -EEXIST - to -EBUSY. Add comments. Re-base over comment addition in patch 1. -v2: New. - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -82,8 +83,13 @@ struct grant_table { - grant_status_t **status; - /* Active grant table. */ - struct active_grant_entry **active; -- /* Mapping tracking table per vcpu. */ -+ /* Handle-indexed tracking table of mappings. */ - struct grant_mapping **maptrack; -+ /* -+ * MFN-indexed tracking tree of mappings, if needed. Note that this is -+ * protected by @lock, not @maptrack_lock. -+ */ -+ struct radix_tree_root maptrack_tree; - - /* Domain to which this struct grant_table belongs. */ - const struct domain *domain; -@@ -516,34 +522,6 @@ static int get_paged_frame(unsigned long - return GNTST_okay; - } - --static inline void --double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) --{ -- /* -- * See mapkind() for why the write lock is also required for the -- * remote domain. -- */ -- if ( lgt < rgt ) -- { -- grant_write_lock(lgt); -- grant_write_lock(rgt); -- } -- else -- { -- if ( lgt != rgt ) -- grant_write_lock(rgt); -- grant_write_lock(lgt); -- } --} -- --static inline void --double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) --{ -- grant_write_unlock(lgt); -- if ( lgt != rgt ) -- grant_write_unlock(rgt); --} -- - #define INVALID_MAPTRACK_HANDLE UINT_MAX - - static inline grant_handle_t -@@ -970,41 +948,17 @@ static struct active_grant_entry *grant_ - return ERR_PTR(-EINVAL); - } - --#define MAPKIND_READ 1 --#define MAPKIND_WRITE 2 --static unsigned int mapkind( -- struct grant_table *lgt, const struct domain *rd, mfn_t mfn) --{ -- struct grant_mapping *map; -- grant_handle_t handle, limit = lgt->maptrack_limit; -- unsigned int kind = 0; -- -- /* -- * Must have the local domain's grant table write lock when -- * iterating over its maptrack entries. -- */ -- ASSERT(percpu_rw_is_write_locked(&lgt->lock)); -- /* -- * Must have the remote domain's grant table write lock while -- * counting its active entries. 
-- */ -- ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock)); -- -- smp_rmb(); -- -- for ( handle = 0; !(kind & MAPKIND_WRITE) && handle < limit; handle++ ) -- { -- map = &maptrack_entry(lgt, handle); -- if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || -- map->domid != rd->domain_id ) -- continue; -- if ( mfn_eq(_active_entry(rd->grant_table, map->ref).mfn, mfn) ) -- kind |= map->flags & GNTMAP_readonly ? -- MAPKIND_READ : MAPKIND_WRITE; -- } -- -- return kind; --} -+union maptrack_node { -+ struct { -+ /* Radix tree slot pointers use two of the bits. */ -+#ifdef __BIG_ENDIAN_BITFIELD -+ unsigned long : 2; -+#endif -+ unsigned long rd : BITS_PER_LONG / 2 - 1; -+ unsigned long wr : BITS_PER_LONG / 2 - 1; -+ } cnt; -+ unsigned long raw; -+}; - - static void - map_grant_ref( -@@ -1023,7 +977,6 @@ map_grant_ref( - struct grant_mapping *mt; - grant_entry_header_t *shah; - uint16_t *status; -- bool_t need_iommu; - - ld = current->domain; - -@@ -1244,31 +1197,75 @@ map_grant_ref( - * as mem-sharing and IOMMU use are incompatible). The dom_io case would - * need checking separately if we compared against owner here. - */ -- need_iommu = ld != rd && gnttab_need_iommu_mapping(ld); -- if ( need_iommu ) -+ if ( ld != rd && gnttab_need_iommu_mapping(ld) ) - { -+ union maptrack_node node = { -+ .cnt.rd = !!(op->flags & GNTMAP_readonly), -+ .cnt.wr = !(op->flags & GNTMAP_readonly), -+ }; -+ int err; -+ void **slot = NULL; - unsigned int kind; - -- double_gt_lock(lgt, rgt); -+ grant_write_lock(lgt); -+ -+ err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn), -+ radix_tree_ulong_to_ptr(node.raw)); -+ if ( err == -EEXIST ) -+ { -+ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn)); -+ if ( likely(slot) ) -+ { -+ node.raw = radix_tree_ptr_to_ulong(*slot); -+ err = -EBUSY; -+ -+ /* Update node only when refcount doesn't overflow. */ -+ if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd -+ : ++node.cnt.wr ) -+ { -+ radix_tree_replace_slot(slot, -+ radix_tree_ulong_to_ptr(node.raw)); -+ err = 0; -+ } -+ } -+ else -+ ASSERT_UNREACHABLE(); -+ } - - /* - * We're not translated, so we know that dfns and mfns are - * the same things, so the IOMMU entry is always 1-to-1. - */ -- kind = mapkind(lgt, rd, mfn); -- if ( !(op->flags & GNTMAP_readonly) && -- !(kind & MAPKIND_WRITE) ) -+ if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 ) - kind = IOMMUF_readable | IOMMUF_writable; -- else if ( !kind ) -+ else if ( (op->flags & GNTMAP_readonly) && -+ node.cnt.rd == 1 && !node.cnt.wr ) - kind = IOMMUF_readable; - else - kind = 0; -- if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind) ) -+ if ( err || -+ (kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind)) ) - { -- double_gt_unlock(lgt, rgt); -+ if ( !err ) -+ { -+ if ( slot ) -+ { -+ op->flags & GNTMAP_readonly ? node.cnt.rd-- -+ : node.cnt.wr--; -+ radix_tree_replace_slot(slot, -+ radix_tree_ulong_to_ptr(node.raw)); -+ } -+ else -+ radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn)); -+ } -+ - rc = GNTST_general_error; -- goto undo_out; - } -+ -+ grant_write_unlock(lgt); -+ -+ if ( rc != GNTST_okay ) -+ goto undo_out; - } - - TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); -@@ -1276,10 +1273,6 @@ map_grant_ref( - /* - * All maptrack entry users check mt->flags first before using the - * other fields so just ensure the flags field is stored last. -- * -- * However, if gnttab_need_iommu_mapping() then this would race -- * with a concurrent mapkind() call (on an unmap, for example) -- * and a lock is required. 
- */ - mt = &maptrack_entry(lgt, handle); - mt->domid = op->dom; -@@ -1287,9 +1280,6 @@ map_grant_ref( - smp_wmb(); - write_atomic(&mt->flags, op->flags); - -- if ( need_iommu ) -- double_gt_unlock(lgt, rgt); -- - op->dev_bus_addr = mfn_to_maddr(mfn); - op->handle = handle; - op->status = GNTST_okay; -@@ -1497,19 +1487,34 @@ unmap_common( - /* See the respective comment in map_grant_ref(). */ - if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) ) - { -- unsigned int kind; -+ void **slot; -+ union maptrack_node node; - int err = 0; - -- double_gt_lock(lgt, rgt); -+ grant_write_lock(lgt); -+ slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn)); -+ node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0; -+ -+ /* Refcount must not underflow. */ -+ if ( !(flags & GNTMAP_readonly ? node.cnt.rd-- -+ : node.cnt.wr--) ) -+ BUG(); - -- kind = mapkind(lgt, rd, op->mfn); -- if ( !kind ) -+ if ( !node.raw ) - err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 1); -- else if ( !(kind & MAPKIND_WRITE) ) -+ else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr ) - err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 1, - IOMMUF_readable); - -- double_gt_unlock(lgt, rgt); -+ if ( err ) -+ ; -+ else if ( !node.raw ) -+ radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn)); -+ else -+ radix_tree_replace_slot(slot, -+ radix_tree_ulong_to_ptr(node.raw)); -+ -+ grant_write_unlock(lgt); - - if ( err ) - rc = GNTST_general_error; -@@ -1956,6 +1961,8 @@ int grant_table_init(struct domain *d, i - gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack)); - if ( gt->maptrack == NULL ) - goto out; -+ -+ radix_tree_init(>->maptrack_tree); - } - - /* Shared grant table. */ -@@ -3704,6 +3711,8 @@ int gnttab_release_mappings(struct domai - - for ( handle = gt->maptrack_limit; handle; ) - { -+ mfn_t mfn; -+ - /* - * Deal with full pages such that their freeing (in the body of the - * if()) remains simple. -@@ -3801,17 +3810,31 @@ int gnttab_release_mappings(struct domai - - reduce_status_for_pin(rd, act, status, map->flags & GNTMAP_readonly); - -+ mfn = act->mfn; -+ - active_entry_release(act); - grant_read_unlock(rgt); - - rcu_unlock_domain(rd); - - map->flags = 0; -+ -+ /* -+ * This is excessive in that a single such call would suffice per -+ * mapped MFN (or none at all, if no entry was ever inserted). But it -+ * should be the common case for an MFN to be mapped just once, and -+ * this way we don't need to further maintain the counters. We also -+ * don't want to leave cleaning up of the tree as a whole to the end -+ * of the function, as this could take quite some time. -+ */ -+ radix_tree_delete(>->maptrack_tree, mfn_x(mfn)); - } - - gt->maptrack_limit = 0; - FREE_XENHEAP_PAGE(gt->maptrack[0]); - -+ radix_tree_destroy(>->maptrack_tree, NULL); -+ - return 0; - } - ---- a/xen/include/xen/radix-tree.h -+++ b/xen/include/xen/radix-tree.h -@@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int( - return (int)((long)ptr >> 2); - } - -+/** -+ * radix_tree_{ulong_to_ptr,ptr_to_ulong}: -+ * -+ * Same for unsigned long values. Beware though that only BITS_PER_LONG-2 -+ * bits are actually usable for the value. 
-+ */ -+static inline void *radix_tree_ulong_to_ptr(unsigned long val) -+{ -+ unsigned long ptr = (val << 2) | 0x2; -+ ASSERT((ptr >> 2) == val); -+ return (void *)ptr; -+} -+ -+static inline unsigned long radix_tree_ptr_to_ulong(void *ptr) -+{ -+ ASSERT(((unsigned long)ptr & 0x3) == 0x2); -+ return (unsigned long)ptr >> 2; -+} -+ - int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); - void *radix_tree_lookup(struct radix_tree_root *, unsigned long); - void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); diff --git a/xsa380-3.patch b/xsa380-3.patch deleted file mode 100644 index 1517249..0000000 --- a/xsa380-3.patch +++ /dev/null @@ -1,74 +0,0 @@ -From: Jan Beulich -Subject: gnttab: avoid triggering assertion in radix_tree_ulong_to_ptr() - -Relevant quotes from the C11 standard: - -"Except where explicitly stated otherwise, for the purposes of this - subclause unnamed members of objects of structure and union type do not - participate in initialization. Unnamed members of structure objects - have indeterminate value even after initialization." - -"If there are fewer initializers in a brace-enclosed list than there are - elements or members of an aggregate, [...], the remainder of the - aggregate shall be initialized implicitly the same as objects that have - static storage duration." - -"If an object that has static or thread storage duration is not - initialized explicitly, then: - [...] - — if it is an aggregate, every member is initialized (recursively) - according to these rules, and any padding is initialized to zero - bits; - [...]" - -"A bit-field declaration with no declarator, but only a colon and a - width, indicates an unnamed bit-field." Footnote: "An unnamed bit-field - structure member is useful for padding to conform to externally imposed - layouts." - -"There may be unnamed padding within a structure object, but not at its - beginning." - -Which makes me conclude: -- Whether an unnamed bit-field member is an unnamed member or padding is - unclear, and hence also whether the last quote above would render the - big endian case of the structure declaration invalid. -- Whether the number of members of an aggregate includes unnamed ones is - also not really clear. -- The initializer in map_grant_ref() initializes all fields of the "cnt" - sub-structure of the union, so assuming the second quote above applies - here (indirectly), the compiler isn't required to implicitly - initialize the rest (i.e. in particular any padding) like would happen - for static storage duration objects. - -Gcc 7.4.1 can be observed (apparently in debug builds only) to translate -aforementioned initializer to a read-modify-write operation of a stack -variable, leaving unchanged the top two bits of whatever was previously -in that stack slot. Clearly if either of the two bits were set, -radix_tree_ulong_to_ptr()'s assertion would trigger. - -Therefore, to be on the safe side, add an explicit padding field for the -non-big-endian-bitfields case and give a dummy name to both padding -fields. - -Fixes: 9781b51efde2 ("gnttab: replace mapkind()") -Signed-off-by: Jan Beulich -Acked-by: Andrew Cooper - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -952,10 +952,13 @@ union maptrack_node { - struct { - /* Radix tree slot pointers use two of the bits. 
*/ - #ifdef __BIG_ENDIAN_BITFIELD -- unsigned long : 2; -+ unsigned long _0 : 2; - #endif - unsigned long rd : BITS_PER_LONG / 2 - 1; - unsigned long wr : BITS_PER_LONG / 2 - 1; -+#ifndef __BIG_ENDIAN_BITFIELD -+ unsigned long _0 : 2; -+#endif - } cnt; - unsigned long raw; - }; diff --git a/xsa382.patch b/xsa382.patch deleted file mode 100644 index 936c2de..0000000 --- a/xsa382.patch +++ /dev/null @@ -1,34 +0,0 @@ -From: Jan Beulich -Subject: gnttab: fix array capacity check in gnttab_get_status_frames() - -The number of grant frames is of no interest here; converting the passed -in op.nr_frames this way means we allow for 8 times as many GFNs to be -written as actually fit in the array. We would corrupt xlat areas of -higher vCPU-s (after having faulted many times while trying to write to -the guard pages between any two areas) for 32-bit PV guests. For HVM -guests we'd simply crash as soon as we hit the first guard page, as -accesses to the xlat area are simply memcpy() there. - -This is CVE-2021-28699 / XSA-382. - -Fixes: 18b1be5e324b ("gnttab: make resource limits per domain") -Signed-off-by: Jan Beulich - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -3243,12 +3243,11 @@ gnttab_get_status_frames(XEN_GUEST_HANDL - goto unlock; - } - -- if ( unlikely(limit_max < grant_to_status_frames(op.nr_frames)) ) -+ if ( unlikely(limit_max < op.nr_frames) ) - { - gdprintk(XENLOG_WARNING, -- "grant_to_status_frames(%u) for d%d is too large (%u,%u)\n", -- op.nr_frames, d->domain_id, -- grant_to_status_frames(op.nr_frames), limit_max); -+ "nr_status_frames for %pd is too large (%u,%u)\n", -+ d, op.nr_frames, limit_max); - op.status = GNTST_general_error; - goto unlock; - } diff --git a/xsa383.patch b/xsa383.patch deleted file mode 100644 index 9ab5eb3..0000000 --- a/xsa383.patch +++ /dev/null @@ -1,55 +0,0 @@ -From: Julien Grall -Date: Sat, 3 Jul 2021 14:03:36 +0100 -Subject: [PATCH] xen/arm: Restrict the amount of memory that dom0less domU and - dom0 can allocate - -Currently, both dom0less domUs and dom0 can allocate an "unlimited" -amount of memory because d->max_pages is set to ~0U. - -In particular, the former are meant to be unprivileged. Therefore the -memory they could allocate should be bounded. As the domain are not yet -officially aware of Xen (we don't expose advertise it in the DT, yet -the hypercalls are accessible), they should not need to allocate more -than the initial amount. So cap set d->max_pages directly the amount of -memory we are meant to allocate. - -Take the opportunity to also restrict the memory for dom0 as the -domain is direct mapped (e.g. MFN == GFN) and therefore cannot -allocate outside of the pre-allocated region. - -This is CVE-2021-28700 / XSA-383. 
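
The cap described above is just the domain's initial allocation expressed as a page count: the dom0less domU's "memory" value is given in KiB, hence the SZ_1K multiplier and PAGE_SHIFT shift visible in the hunk further down. A minimal stand-alone sketch of that arithmetic, assuming the usual 4 KiB page size (PAGE_SHIFT of 12) purely for illustration rather than quoting Xen's own definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: assumes 4 KiB pages (PAGE_SHIFT == 12). */
#define PAGE_SHIFT 12
#define SZ_1K      1024ULL

/* Convert a dom0less domU "memory" property (in KiB) into a page-count cap. */
static uint64_t mem_kib_to_pages(uint64_t mem_kib)
{
    return (mem_kib * SZ_1K) >> PAGE_SHIFT;
}

int main(void)
{
    /* A 128 MiB guest: 131072 KiB -> 32768 four-KiB pages. */
    printf("%llu\n", (unsigned long long)mem_kib_to_pages(131072));
    return 0;
}

The dom0 hunk applies the same conversion, shifting dom0_mem right by PAGE_SHIFT.
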
- -Reported-by: Julien Grall -Signed-off-by: Julien Grall -Reviewed-by: Stefano Stabellini -Tested-by: Stefano Stabellini ---- - xen/arch/arm/domain_build.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c -index 6c86d527810f..206038d1c022 100644 ---- a/xen/arch/arm/domain_build.c -+++ b/xen/arch/arm/domain_build.c -@@ -2440,7 +2440,8 @@ static int __init construct_domU(struct domain *d, - - if ( vcpu_create(d, 0) == NULL ) - return -ENOMEM; -- d->max_pages = ~0U; -+ -+ d->max_pages = ((paddr_t)mem * SZ_1K) >> PAGE_SHIFT; - - kinfo.d = d; - -@@ -2546,7 +2547,7 @@ static int __init construct_dom0(struct domain *d) - - iommu_hwdom_init(d); - -- d->max_pages = ~0U; -+ d->max_pages = dom0_mem >> PAGE_SHIFT; - - kinfo.unassigned_mem = dom0_mem; - kinfo.d = d; --- -2.17.1 - diff --git a/xsa384.patch b/xsa384.patch deleted file mode 100644 index 4f155ac..0000000 --- a/xsa384.patch +++ /dev/null @@ -1,81 +0,0 @@ -From: Jan Beulich -Subject: gnttab: deal with status frame mapping race - -Once gnttab_map_frame() drops the grant table lock, the MFN it reports -back to its caller is free to other manipulation. In particular -gnttab_unpopulate_status_frames() might free it, by a racing request on -another CPU, thus resulting in a reference to a deallocated page getting -added to a domain's P2M. - -Obtain a page reference in gnttab_map_frame() to prevent freeing of the -page until xenmem_add_to_physmap_one() has actually completed its acting -on the page. Do so uniformly, even if only strictly required for v2 -status pages, to avoid extra conditionals (which then would all need to -be kept in sync going forward). - -This is CVE-2021-28701 / XSA-384. - -Reported-by: Julien Grall -Signed-off-by: Jan Beulich -Reviewed-by: Julien Grall ---- -v2: Pull get_page() earlier and fold if()s. - ---- a/xen/arch/arm/mm.c -+++ b/xen/arch/arm/mm.c -@@ -1420,6 +1420,8 @@ int xenmem_add_to_physmap_one( - if ( rc ) - return rc; - -+ /* Need to take care of the reference obtained in gnttab_map_frame(). */ -+ page = mfn_to_page(mfn); - t = p2m_ram_rw; - - break; -@@ -1487,9 +1489,12 @@ int xenmem_add_to_physmap_one( - /* Map at new location. */ - rc = guest_physmap_add_entry(d, gfn, mfn, 0, t); - -- /* If we fail to add the mapping, we need to drop the reference we -- * took earlier on foreign pages */ -- if ( rc && space == XENMAPSPACE_gmfn_foreign ) -+ /* -+ * For XENMAPSPACE_gmfn_foreign if we failed to add the mapping, we need -+ * to drop the reference we took earlier. In all other cases we need to -+ * drop any reference we took earlier (perhaps indirectly). -+ */ -+ if ( space == XENMAPSPACE_gmfn_foreign ? rc : page != NULL ) - { - ASSERT(page != NULL); - put_page(page); ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -2726,6 +2726,8 @@ int xenmem_add_to_physmap_one( - rc = gnttab_map_frame(d, idx, gpfn, &mfn); - if ( rc ) - return rc; -+ /* Need to take care of the reference obtained in gnttab_map_frame(). */ -+ page = mfn_to_page(mfn); - break; - - case XENMAPSPACE_gmfn: ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -4097,7 +4097,16 @@ int gnttab_map_frame(struct domain *d, u - } - - if ( !rc ) -- gnttab_set_frame_gfn(gt, status, idx, gfn); -+ { -+ /* -+ * Make sure gnttab_unpopulate_status_frames() won't (successfully) -+ * free the page until our caller has completed its operation. 
-+ */ -+ if ( get_page(mfn_to_page(*mfn), d) ) -+ gnttab_set_frame_gfn(gt, status, idx, gfn); -+ else -+ rc = -EBUSY; -+ } - - grant_write_unlock(gt); -
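
The hunk above takes the page reference while the grant-table write lock is still held, which is what stops gnttab_unpopulate_status_frames() from succeeding until xenmem_add_to_physmap_one() has finished and dropped the reference again. As a rough stand-alone sketch of that pattern — pin the object under the same lock the freeing path takes, and have the free refuse to proceed while extra references exist — here is a toy model; the names (struct page, map_frame, unpopulate_frame, put_page_like) are invented for the illustration and are not the Xen code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
    pthread_mutex_t lock;   /* stands in for the grant-table write lock */
    unsigned int refcount;  /* 1 == held only by the table itself */
    bool freed;
};

/* Mapping path: pin the page before the lock is released. */
static bool map_frame(struct page *pg)
{
    bool ok;

    pthread_mutex_lock(&pg->lock);
    ok = !pg->freed;
    if (ok)
        pg->refcount++;            /* analogous to get_page() */
    pthread_mutex_unlock(&pg->lock);

    return ok;                     /* caller must call put_page_like() later */
}

/* Freeing path: only succeeds once no caller still holds a reference. */
static bool unpopulate_frame(struct page *pg)
{
    bool ok;

    pthread_mutex_lock(&pg->lock);
    ok = (pg->refcount == 1);      /* only the table's own reference left */
    if (ok) {
        pg->refcount = 0;
        pg->freed = true;
    }
    pthread_mutex_unlock(&pg->lock);

    return ok;
}

static void put_page_like(struct page *pg)
{
    pthread_mutex_lock(&pg->lock);
    pg->refcount--;                /* analogous to put_page() */
    pthread_mutex_unlock(&pg->lock);
}

int main(void)
{
    static struct page pg = { .lock = PTHREAD_MUTEX_INITIALIZER, .refcount = 1 };

    if (map_frame(&pg)) {
        /* A racing free must be refused while the mapping is in flight. */
        printf("free during map: %s\n",
               unpopulate_frame(&pg) ? "freed" : "refused");
        put_page_like(&pg);
    }
    printf("free after map:  %s\n",
           unpopulate_frame(&pg) ? "freed" : "refused");
    return 0;
}

The first attempt prints "refused" and the second "freed", which is the ordering the patch enforces for the status frame.
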