diff --git a/4.10.0-shim-comet-3.patch b/4.10.0-shim-comet-3.patch new file mode 100644 index 0000000..61f2645 --- /dev/null +++ b/4.10.0-shim-comet-3.patch @@ -0,0 +1,10861 @@ +From ab7be6ce4ac8cc3f32952d8c9c260412e780e939 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 20 Dec 2017 15:40:58 +0100 +Subject: [PATCH 02/77] xen/pv: Construct d0v0's GDT properly + +c/s cf6d39f8199 "x86/PV: properly populate descriptor tables" changed the GDT +to reference zero_page for intermediate frames between the guest and Xen +frames. + +Because dom0_construct_pv() doesn't call arch_set_info_guest(), some bits of +initialisation are missed, including the pv_destroy_gdt() which initially +fills the references to zero_page. + +In practice, this means there is a window between starting and the first call +to HYPERCALL_set_gdt() where lar/lsl/verr/verw suffer non-architectural +behaviour. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: 08f27f4468eedbeccaac9fdda4ef732247efd74e +master date: 2017-12-01 19:03:26 +0000 +--- + xen/arch/x86/pv/dom0_build.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index 44601d08d3..a13412efb9 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + + /* Allow ring-3 access in long mode as guest cannot use ring 1 ... 
*/ +@@ -866,6 +867,13 @@ int __init dom0_construct_pv(struct domain *d, + regs->rsi = vstartinfo_start; + regs->eflags = X86_EFLAGS_IF; + ++ /* ++ * We don't call arch_set_info_guest(), so some initialisation needs doing ++ * by hand: ++ * - Reset the GDT to reference zero_page ++ */ ++ pv_destroy_gdt(v); ++ + if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) ) + panic("Dom0 requires supervisor-mode execution"); + +-- +2.14.3 + + +From 4150501b717e7fde77c9ab4e96dd9916d7345b55 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Wed, 20 Dec 2017 15:41:33 +0100 +Subject: [PATCH 03/77] x86/vvmx: don't enable vmcs shadowing for nested guests + +Running "./xtf_runner vvmx" in L1 Xen under L0 Xen produces the +following result on H/W with VMCS shadowing: + + Test: vmxon + Failure in test_vmxon_in_root_cpl0() + Expected 0x8200000f: VMfailValid(15) VMXON_IN_ROOT + Got 0x82004400: VMfailValid(17408) + Test result: FAILURE + +This happens because SDM allows vmentries with enabled VMCS shadowing +VM-execution control and VMCS link pointer value of ~0ull. But results +of a nested VMREAD are undefined in such cases. + +Fix this by not copying the value of VMCS shadowing control from vmcs01 +to vmcs02. + +Signed-off-by: Sergey Dyasli +Acked-by: Kevin Tian +master commit: 19fdb8e258619aea265af9c183e035e545cbc2d2 +master date: 2017-12-01 19:03:27 +0000 +--- + xen/arch/x86/hvm/vmx/vvmx.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c +index dde02c076b..013d049f8a 100644 +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -633,6 +633,7 @@ void nvmx_update_secondary_exec_control(struct vcpu *v, + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; + + host_cntrl &= ~apicv_bit; ++ host_cntrl &= ~SECONDARY_EXEC_ENABLE_VMCS_SHADOWING; + shadow_cntrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL); + + /* No vAPIC-v support, so it shouldn't be set in vmcs12. 
*/ +-- +2.14.3 + + +From c8f4f45e04dd782ac5dfdf58866339ac97186324 Mon Sep 17 00:00:00 2001 +From: Daniel Kiper +Date: Wed, 20 Dec 2017 15:42:13 +0100 +Subject: [PATCH 04/77] x86/mb2: avoid Xen image when looking for + module/crashkernel position + +Commit e22e1c4 (x86/EFI: avoid Xen image when looking for module/kexec +position) added relevant check for EFI case. However, since commit +f75a304 (x86: add multiboot2 protocol support for relocatable images) +Multiboot2 compatible bootloaders are able to relocate Xen image too. +So, we have to avoid also Xen image region in such cases. + +Reported-by: Andrew Cooper +Reported-by: Konrad Rzeszutek Wilk +Signed-off-by: Daniel Kiper +Reviewed-by: Jan Beulich +master commit: 9589927e5bf9e123ec42b6e0b0809f153bd92732 +master date: 2017-12-12 14:30:53 +0100 +--- + xen/arch/x86/setup.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 32bb02e3a5..2e10c6bdf4 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -653,7 +653,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + module_t *mod = (module_t *)__va(mbi->mods_addr); + unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; + int i, j, e820_warn = 0, bytes = 0; +- bool acpi_boot_table_init_done = false; ++ bool acpi_boot_table_init_done = false, relocated = false; + struct domain *dom0; + struct ns16550_defaults ns16550 = { + .data_bits = 8, +@@ -904,8 +904,10 @@ void __init noreturn __start_xen(unsigned long mbi_p) + mod[i].reserved = 0; + } + +- if ( efi_enabled(EFI_LOADER) ) ++ if ( xen_phys_start ) + { ++ relocated = true; ++ + /* + * This needs to remain in sync with xen_in_range() and the + * respective reserve_e820_ram() invocation below. +@@ -1098,8 +1100,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + /* Don't overlap with other modules (or Xen itself). 
*/ + end = consider_modules(s, e, size, mod, +- mbi->mods_count + efi_enabled(EFI_LOADER), +- j); ++ mbi->mods_count + relocated, j); + + if ( highmem_start && end > highmem_start ) + continue; +@@ -1126,7 +1127,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + { + /* Don't overlap with modules (or Xen itself). */ + e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod, +- mbi->mods_count + efi_enabled(EFI_LOADER), -1); ++ mbi->mods_count + relocated, -1); + if ( s >= e ) + break; + if ( e > kexec_crash_area_limit ) +-- +2.14.3 + + +From e2dc7b584f4c7ab6ad7ab543e5cf7ee2e6d1d569 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 20 Dec 2017 15:42:42 +0100 +Subject: [PATCH 05/77] x86/mm: drop bogus paging mode assertion + +Olaf has observed this assertion to trigger after an aborted migration +of a PV guest: + +(XEN) Xen call trace: +(XEN) [] do_page_fault+0x39f/0x55c +(XEN) [] x86_64/entry.S#handle_exception_saved+0x66/0xa4 +(XEN) [] __copy_to_user_ll+0x22/0x30 +(XEN) [] update_runstate_area+0x19c/0x228 +(XEN) [] domain.c#_update_runstate_area+0x11/0x39 +(XEN) [] context_switch+0x1fd/0xf25 +(XEN) [] schedule.c#schedule+0x303/0x6a8 +(XEN) [] softirq.c#__do_softirq+0x6c/0x95 +(XEN) [] do_softirq+0x13/0x15 +(XEN) [] x86_64/entry.S#process_softirqs+0x21/0x30 + +Release builds work fine, which is a first indication that the assertion +isn't really needed. + +What's worse though - there appears to be a timing window where the +guest runs in shadow mode, but not in log-dirty mode, and that is what +triggers the assertion (the same could, afaict, be achieved by test- +enabling shadow mode on a PV guest). 
This is because turning off log- +dirty mode is being performed in two steps: First the log-dirty bit gets +cleared (paging_log_dirty_disable() [having paused the domain] -> +sh_disable_log_dirty() -> shadow_one_bit_disable()), followed by +unpausing the domain and only then clearing shadow mode (via +shadow_test_disable(), which pauses the domain a second time). + +Hence besides removing the ASSERT() here (or optionally replacing it by +explicit translate and refcounts mode checks, but this seems rather +pointless now that the three are tied together) I wonder whether either +shadow_one_bit_disable() should turn off shadow mode if no other bit +besides PG_SH_enable remains set (just like shadow_one_bit_enable() +enables it if not already set), or the domain pausing scope should be +extended so that both steps occur without the domain getting a chance to +run in between. + +Reported-by: Olaf Hering +Signed-off-by: Jan Beulich +Reviewed-by: Tim Deegan +Acked-by: Andrew Cooper +master commit: b95f7be32d668fa4b09300892ebe19636ecebe36 +master date: 2017-12-12 16:56:15 +0100 +--- + xen/arch/x86/traps.c | 6 +----- + xen/include/asm-x86/paging.h | 3 --- + 2 files changed, 1 insertion(+), 8 deletions(-) + +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 86506f3747..642f3cc6d7 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -1338,12 +1338,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) + */ + if ( paging_mode_enabled(d) && !paging_mode_external(d) ) + { +- int ret; ++ int ret = paging_fault(addr, regs); + +- /* Logdirty mode is the only expected paging mode for PV guests. 
*/ +- ASSERT(paging_mode_only_log_dirty(d)); +- +- ret = paging_fault(addr, regs); + if ( ret == EXCRET_fault_fixed ) + trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr); + return ret; +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index d99ddedec0..5607ab4b1f 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -69,9 +69,6 @@ + #define paging_mode_translate(_d) (!!((_d)->arch.paging.mode & PG_translate)) + #define paging_mode_external(_d) (!!((_d)->arch.paging.mode & PG_external)) + +-#define paging_mode_only_log_dirty(_d) \ +- (((_d)->arch.paging.mode & PG_MASK) == PG_log_dirty) +- + /* flags used for paging debug */ + #define PAGING_DEBUG_LOGDIRTY 0 + +-- +2.14.3 + + +From e5364c32c650fef60b91b9be9b10f38055ffc2cf Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 15:43:14 +0100 +Subject: [PATCH 06/77] x86/microcode: Add support for fam17h microcode loading + +The size for the Microcode Patch Block (MPB) for an AMD family 17h +processor is 3200 bytes. Add a #define for fam17h so that it does +not default to 2048 bytes and fail a microcode load/update. + +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Signed-off-by: Ingo Molnar +[Linux commit f4e9b7af0cd58dd039a0fb2cd67d57cea4889abf] + +Ported to Xen. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +master commit: 61d458ba8c171809e8dd9abd19339c87f3f934ca +master date: 2017-12-13 14:30:10 +0000 +--- + xen/arch/x86/microcode_amd.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xen/arch/x86/microcode_amd.c b/xen/arch/x86/microcode_amd.c +index b54b0b99e4..53f9f548cd 100644 +--- a/xen/arch/x86/microcode_amd.c ++++ b/xen/arch/x86/microcode_amd.c +@@ -107,6 +107,7 @@ static bool_t verify_patch_size(uint32_t patch_size) + #define F14H_MPB_MAX_SIZE 1824 + #define F15H_MPB_MAX_SIZE 4096 + #define F16H_MPB_MAX_SIZE 3458 ++#define F17H_MPB_MAX_SIZE 3200 + + switch (boot_cpu_data.x86) + { +@@ -119,6 +120,9 @@ static bool_t verify_patch_size(uint32_t patch_size) + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; ++ case 0x17: ++ max_size = F17H_MPB_MAX_SIZE; ++ break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; +-- +2.14.3 + + +From 19dcd8e47dfc81b8e9f867ee79c7ff8e15b975fb Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 20 Dec 2017 15:43:53 +0100 +Subject: [PATCH 07/77] gnttab: correct GNTTABOP_cache_flush empty batch + handling + +Jann validly points out that with a caller bogusly requesting a zero- +element batch with non-zero high command bits (the ones used for +continuation encoding), the assertion right before the call to +hypercall_create_continuation() would trigger. A similar situation would +arise afaict for non-empty batches with op and/or length zero in every +element. + +While we want the former to succeed (as we do elsewhere for similar +no-op requests), the latter can clearly be converted to an error, as +this is a state that can't be the result of a prior operation. + +Take the opportunity and also correct the order of argument checks: +We shouldn't accept zero-length elements with unknown bits set in "op". +Also constify cache_flush()'s first parameter. 
+ +Reported-by: Jann Horn +Signed-off-by: Jan Beulich +Reviewed-by: Andre Przywara +Acked-by: Stefano Stabellini +master commit: 9c22e4d67f5552c7c896ed83bd95d5d4c5837a9d +master date: 2017-12-04 11:03:32 +0100 +--- + xen/common/grant_table.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index c5950f2b3f..bce224be6e 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3208,7 +3208,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, + return 0; + } + +-static int cache_flush(gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) ++static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + { + struct domain *d, *owner; + struct page_info *page; +@@ -3218,19 +3218,17 @@ static int cache_flush(gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + + if ( (cflush->offset >= PAGE_SIZE) || + (cflush->length > PAGE_SIZE) || +- (cflush->offset + cflush->length > PAGE_SIZE) ) ++ (cflush->offset + cflush->length > PAGE_SIZE) || ++ (cflush->op & ~(GNTTAB_CACHE_INVAL | GNTTAB_CACHE_CLEAN)) ) + return -EINVAL; + + if ( cflush->length == 0 || cflush->op == 0 ) +- return 0; ++ return !*cur_ref ? 
0 : -EILSEQ; + + /* currently unimplemented */ + if ( cflush->op & GNTTAB_CACHE_SOURCE_GREF ) + return -EOPNOTSUPP; + +- if ( cflush->op & ~(GNTTAB_CACHE_INVAL|GNTTAB_CACHE_CLEAN) ) +- return -EINVAL; +- + d = rcu_lock_current_domain(); + mfn = cflush->a.dev_bus_addr >> PAGE_SHIFT; + +@@ -3310,6 +3308,9 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, + *cur_ref = 0; + guest_handle_add_offset(uop, 1); + } ++ ++ *cur_ref = 0; ++ + return 0; + } + +-- +2.14.3 + + +From 682a9d8d37f1141b199bc3aadf8d5d276b22baf9 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 20 Dec 2017 15:44:20 +0100 +Subject: [PATCH 08/77] gnttab: improve GNTTABOP_cache_flush locking + +Dropping the lock before returning from grant_map_exists() means handing +possibly stale information back to the caller. Return back the pointer +to the active entry instead, for the caller to release the lock once +done. + +Signed-off-by: Jan Beulich +Reviewed-by: Andre Przywara +Reviewed-by: Stefano Stabellini +master commit: 553ac37137c2d1c03bf1b69cfb192ffbfe29daa4 +master date: 2017-12-04 11:04:18 +0100 +--- + xen/common/grant_table.c | 37 +++++++++++++++++-------------------- + 1 file changed, 17 insertions(+), 20 deletions(-) + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index bce224be6e..250450bdda 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -786,10 +786,10 @@ static int _set_status(unsigned gt_version, + return _set_status_v2(domid, readonly, mapflag, shah, act, status); + } + +-static int grant_map_exists(const struct domain *ld, +- struct grant_table *rgt, +- unsigned long mfn, +- grant_ref_t *cur_ref) ++static struct active_grant_entry *grant_map_exists(const struct domain *ld, ++ struct grant_table *rgt, ++ unsigned long mfn, ++ grant_ref_t *cur_ref) + { + grant_ref_t ref, max_iter; + +@@ -805,28 +805,20 @@ static int grant_map_exists(const struct domain *ld, + nr_grant_entries(rgt)); + for ( ref = *cur_ref; ref < 
max_iter; ref++ ) + { +- struct active_grant_entry *act; +- bool_t exists; +- +- act = active_entry_acquire(rgt, ref); +- +- exists = act->pin +- && act->domid == ld->domain_id +- && act->frame == mfn; ++ struct active_grant_entry *act = active_entry_acquire(rgt, ref); + ++ if ( act->pin && act->domid == ld->domain_id && act->frame == mfn ) ++ return act; + active_entry_release(act); +- +- if ( exists ) +- return 0; + } + + if ( ref < nr_grant_entries(rgt) ) + { + *cur_ref = ref; +- return 1; ++ return NULL; + } + +- return -EINVAL; ++ return ERR_PTR(-EINVAL); + } + + #define MAPKIND_READ 1 +@@ -3213,6 +3205,7 @@ static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + struct domain *d, *owner; + struct page_info *page; + unsigned long mfn; ++ struct active_grant_entry *act = NULL; + void *v; + int ret; + +@@ -3250,13 +3243,13 @@ static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + { + grant_read_lock(owner->grant_table); + +- ret = grant_map_exists(d, owner->grant_table, mfn, cur_ref); +- if ( ret != 0 ) ++ act = grant_map_exists(d, owner->grant_table, mfn, cur_ref); ++ if ( IS_ERR_OR_NULL(act) ) + { + grant_read_unlock(owner->grant_table); + rcu_unlock_domain(d); + put_page(page); +- return ret; ++ return act ? PTR_ERR(act) : 1; + } + } + +@@ -3273,7 +3266,11 @@ static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + ret = 0; + + if ( d != owner ) ++ { ++ active_entry_release(act); + grant_read_unlock(owner->grant_table); ++ } ++ + unmap_domain_page(v); + put_page(page); + +-- +2.14.3 + + +From 135b67e9bd5281084efe9fb1d3604915dac07ce8 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 20 Dec 2017 15:44:57 +0100 +Subject: [PATCH 09/77] xen/efi: Fix build with clang-5.0 + +The clang-5.0 build is reliably failing with: + + Error: size of boot.o:.text is 0x01 + +which is because efi_arch_flush_dcache_area() exists as a single ret +instruction. 
Mark it as __init like everything else in the files. + +Spotted by Travis. + +Signed-off-by: Andrew Cooper +Reviewed-by: Stefano Stabellini +Acked-by: Jan Beulich +master commit: c4f6ad4c5fd25cb0ccc0cdbe711db97e097f0407 +master date: 2017-12-14 10:59:26 +0000 +--- + xen/arch/arm/efi/efi-boot.h | 2 +- + xen/arch/x86/efi/efi-boot.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h +index 56de26e918..ca655ff003 100644 +--- a/xen/arch/arm/efi/efi-boot.h ++++ b/xen/arch/arm/efi/efi-boot.h +@@ -597,7 +597,7 @@ static void __init efi_arch_video_init(EFI_GRAPHICS_OUTPUT_PROTOCOL *gop, + { + } + +-static void efi_arch_flush_dcache_area(const void *vaddr, UINTN size) ++static void __init efi_arch_flush_dcache_area(const void *vaddr, UINTN size) + { + __flush_dcache_area(vaddr, size); + } +diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h +index 8d295ff9af..d30f688a5a 100644 +--- a/xen/arch/x86/efi/efi-boot.h ++++ b/xen/arch/x86/efi/efi-boot.h +@@ -668,7 +668,7 @@ static bool __init efi_arch_use_config_file(EFI_SYSTEM_TABLE *SystemTable) + return true; /* x86 always uses a config file */ + } + +-static void efi_arch_flush_dcache_area(const void *vaddr, UINTN size) { } ++static void __init efi_arch_flush_dcache_area(const void *vaddr, UINTN size) { } + + void __init efi_multiboot2(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) + { +-- +2.14.3 + + +From 9dc5eda576bafca47abc7202f075f28d6250bf4d Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 20 Dec 2017 15:45:32 +0100 +Subject: [PATCH 10/77] x86/vmx: Don't use hvm_inject_hw_exception() in + long_mode_do_msr_write() + +Since c/s 49de10f3c1718 "x86/hvm: Don't raise #GP behind the emulators back +for MSR accesses", returning X86EMUL_EXCEPTION has pushed the exception +generation to the top of the call tree. 
+ +Using hvm_inject_hw_exception() and returning X86EMUL_EXCEPTION causes a +double #GP injection, which combines to #DF. + +Signed-off-by: Andrew Cooper +Acked-by: Kevin Tian +Reviewed-by: Jan Beulich +master commit: 896ee3980e72866b602e743396751384de301fb0 +master date: 2017-12-14 18:05:45 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index b18cceab55..73254bf5d4 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -542,7 +542,7 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content) + case MSR_GS_BASE: + case MSR_SHADOW_GS_BASE: + if ( !is_canonical_address(msr_content) ) +- goto uncanonical_address; ++ return HNDL_exception_raised; + + if ( msr == MSR_FS_BASE ) + __vmwrite(GUEST_FS_BASE, msr_content); +@@ -560,14 +560,14 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content) + + case MSR_LSTAR: + if ( !is_canonical_address(msr_content) ) +- goto uncanonical_address; ++ return HNDL_exception_raised; + v->arch.hvm_vmx.lstar = msr_content; + wrmsrl(MSR_LSTAR, msr_content); + break; + + case MSR_CSTAR: + if ( !is_canonical_address(msr_content) ) +- goto uncanonical_address; ++ return HNDL_exception_raised; + v->arch.hvm_vmx.cstar = msr_content; + break; + +@@ -581,11 +581,6 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content) + } + + return HNDL_done; +- +- uncanonical_address: +- HVM_DBG_LOG(DBG_LEVEL_MSR, "Not cano address of msr write %x", msr); +- hvm_inject_hw_exception(TRAP_gp_fault, 0); +- return HNDL_exception_raised; + } + + /* +-- +2.14.3 + + +From a87ec4833af47cdd166294f3f4db21231930d65d Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 4 Jan 2018 14:32:01 +0100 +Subject: [PATCH 11/77] x86/msr: Free msr_vcpu_policy during vcpu destruction + +c/s 4187f79dc7 "x86/msr: introduce struct msr_vcpu_policy" introduced a +per-vcpu memory allocation, but failed to 
free it in the clean vcpu +destruction case. + +This is XSA-253. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: e204e60f77702bf5c884dd37c3f1b01f14e396ae +master date: 2018-01-04 14:27:38 +0100 +--- + xen/arch/x86/domain.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 735f45c133..b44c95b493 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -382,6 +382,9 @@ void vcpu_destroy(struct vcpu *v) + + vcpu_destroy_fpu(v); + ++ xfree(v->arch.msr); ++ v->arch.msr = NULL; ++ + if ( !is_idle_domain(v->domain) ) + vpmu_destroy(v); + +-- +2.14.3 + + +From 69e302e59cfd281449eafb6193476a11a1c286df Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 11 Jan 2018 17:51:14 +0000 +Subject: [PATCH 12/77] x86/upcall: inject a spurious event after setting + upcall vector +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In case the vCPU has pending events to inject. This fixes a bug that +happened if the guest mapped the vcpu info area using +VCPUOP_register_vcpu_info without having setup the event channel +upcall, and then setup the upcall vector. + +In this scenario the guest would not receive any upcalls, because the +call to VCPUOP_register_vcpu_info would have marked the vCPU as having +pending events, but the vector could not be injected because it was +not yet setup. + +This has not caused issues so far because all the consumers first +setup the vector callback and then map the vcpu info page, but there's +no limitation that prevents doing it in the inverse order. 
+ +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +--- + xen/arch/x86/hvm/hvm.c | 1 + + xen/arch/x86/hvm/irq.c | 5 +++++ + 2 files changed, 6 insertions(+) + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index 28bc7e4252..9f7b096072 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -4069,6 +4069,7 @@ static int hvmop_set_evtchn_upcall_vector( + printk(XENLOG_G_INFO "%pv: upcall vector %02x\n", v, op.vector); + + v->arch.hvm_vcpu.evtchn_upcall_vector = op.vector; ++ hvm_assert_evtchn_irq(v); + return 0; + } + +diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c +index 0077f68a83..f528e2d081 100644 +--- a/xen/arch/x86/hvm/irq.c ++++ b/xen/arch/x86/hvm/irq.c +@@ -385,6 +385,7 @@ void hvm_set_callback_via(struct domain *d, uint64_t via) + struct hvm_irq *hvm_irq = hvm_domain_irq(d); + unsigned int gsi=0, pdev=0, pintx=0; + uint8_t via_type; ++ struct vcpu *v; + + via_type = (uint8_t)MASK_EXTR(via, HVM_PARAM_CALLBACK_IRQ_TYPE_MASK) + 1; + if ( ((via_type == HVMIRQ_callback_gsi) && (via == 0)) || +@@ -447,6 +448,10 @@ void hvm_set_callback_via(struct domain *d, uint64_t via) + + spin_unlock(&d->arch.hvm_domain.irq_lock); + ++ for_each_vcpu ( d, v ) ++ if ( is_vcpu_online(v) ) ++ hvm_assert_evtchn_irq(v); ++ + #ifndef NDEBUG + printk(XENLOG_G_INFO "Dom%u callback via changed to ", d->domain_id); + switch ( via_type ) +-- +2.14.3 + + +From caff7f9b59455f1942c96ea7f631e6b0cd9b8e52 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:47:57 +0000 +Subject: [PATCH 13/77] x86/svm: Offer CPUID Faulting to AMD HVM guests as well + +CPUID Faulting can be virtualised for HVM guests without hardware support, +meaning it can be offered to SVM guests. 
+ +Signed-off-by: Andrew Cooper +--- + xen/arch/x86/hvm/svm/svm.c | 6 ++++++ + xen/arch/x86/msr.c | 3 ++- + 2 files changed, 8 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index b9cf423fd9..8864d82c11 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1784,6 +1784,12 @@ static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs) + if ( (inst_len = __get_instruction_length(curr, INSTR_CPUID)) == 0 ) + return; + ++ if ( hvm_check_cpuid_faulting(curr) ) ++ { ++ hvm_inject_hw_exception(TRAP_gp_fault, 0); ++ return; ++ } ++ + guest_cpuid(curr, regs->eax, regs->ecx, &res); + HVMTRACE_5D(CPUID, regs->eax, res.a, res.b, res.c, res.d); + +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index 31983edc54..187f8623a5 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -39,7 +39,8 @@ static void __init calculate_hvm_max_policy(void) + return; + + /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ +- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || ++ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) + { + dp->plaform_info.available = true; + dp->plaform_info.cpuid_faulting = true; +-- +2.14.3 + + +From 5840f40e88fbdcdcf748d0e581dad587ffdde0a1 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 17:47:58 +0000 +Subject: [PATCH 14/77] xen/x86: report domain id on cpuid +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Use the ECX register of the hypervisor leaf 5. The EAX register on +this leaf is a flags field that can be used to notice the presence of +the domain id in ECX. Note that this is only available to HVM guests. + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Use leaf 5 instead. 
+--- + xen/arch/x86/traps.c | 5 +++++ + xen/include/public/arch-x86/cpuid.h | 2 ++ + 2 files changed, 7 insertions(+) + +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 642f3cc6d7..348866b8b5 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -928,6 +928,11 @@ void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf, + /* Indicate presence of vcpu id and set it in ebx */ + res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT; + res->b = v->vcpu_id; ++ ++ /* Indicate presence of domain id and set it in ecx */ ++ res->a |= XEN_HVM_CPUID_DOMID_PRESENT; ++ res->c = d->domain_id; ++ + break; + + case 5: /* PV-specific parameters */ +diff --git a/xen/include/public/arch-x86/cpuid.h b/xen/include/public/arch-x86/cpuid.h +index eb76875d0e..665c4b644d 100644 +--- a/xen/include/public/arch-x86/cpuid.h ++++ b/xen/include/public/arch-x86/cpuid.h +@@ -94,12 +94,14 @@ + * HVM-specific features + * Sub-leaf 0: EAX: Features + * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) ++ * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag) + */ + #define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ + #define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ + /* Memory mapped from other domains has valid IOMMU entries */ + #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) + #define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ ++#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */ + + /* + * Leaf 6 (0x40000x05) +-- +2.14.3 + + +From 40938b5d5696ccdec67b15fb3a49e8a9f1ab1998 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Thu, 11 Jan 2018 17:47:58 +0000 +Subject: [PATCH 15/77] tools/libxc: remove extraneous newline in + xc_dom_load_acpi + +Signed-off-by: Wei Liu +Reviewed-by: Andrew Cooper +--- + tools/libxc/xc_dom_core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libxc/xc_dom_core.c 
b/tools/libxc/xc_dom_core.c +index b5f316a1dc..303cb971e8 100644 +--- a/tools/libxc/xc_dom_core.c ++++ b/tools/libxc/xc_dom_core.c +@@ -1078,7 +1078,7 @@ static int xc_dom_load_acpi(struct xc_dom_image *dom) + + while ( (i < MAX_ACPI_MODULES) && dom->acpi_modules[i].length ) + { +- DOMPRINTF("%s: %d bytes at address %" PRIx64 "\n", __FUNCTION__, ++ DOMPRINTF("%s: %d bytes at address %" PRIx64, __FUNCTION__, + dom->acpi_modules[i].length, + dom->acpi_modules[i].guest_addr_out); + +-- +2.14.3 + + +From 4621c10f489de827742f95c31ac0f43fc3bcde88 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Thu, 11 Jan 2018 17:47:58 +0000 +Subject: [PATCH 16/77] tools/libelf: fix elf notes check for PVH guest + +PVH only requires PHYS32_ENTRY to be set. Return immediately if that's +the case. + +Also remove the printk in pvh_load_kernel. + +Signed-off-by: Wei Liu +Reviewed-by: Andrew Cooper +--- + xen/arch/x86/hvm/dom0_build.c | 4 ---- + xen/common/libelf/libelf-dominfo.c | 9 ++++++++- + 2 files changed, 8 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c +index a67071c739..303ae4e7b5 100644 +--- a/xen/arch/x86/hvm/dom0_build.c ++++ b/xen/arch/x86/hvm/dom0_build.c +@@ -484,10 +484,6 @@ static int __init pvh_load_kernel(struct domain *d, const module_t *image, + return -EINVAL; + } + +- printk("OS: %s version: %s loader: %s bitness: %s\n", parms.guest_os, +- parms.guest_ver, parms.loader, +- elf_64bit(&elf) ? "64-bit" : "32-bit"); +- + /* Copy the OS image and free temporary buffer. 
*/ + elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base); + elf.dest_size = parms.virt_kend - parms.virt_kstart; +diff --git a/xen/common/libelf/libelf-dominfo.c b/xen/common/libelf/libelf-dominfo.c +index a52900c00c..378bc05f39 100644 +--- a/xen/common/libelf/libelf-dominfo.c ++++ b/xen/common/libelf/libelf-dominfo.c +@@ -373,6 +373,13 @@ static elf_errorstatus elf_xen_note_check(struct elf_binary *elf, + return 0; + } + ++ /* PVH only requires one ELF note to be set */ ++ if ( parms->phys_entry != UNSET_ADDR32 ) ++ { ++ elf_msg(elf, "ELF: Found PVH image\n"); ++ return 0; ++ } ++ + /* Check the contents of the Xen notes or guest string. */ + if ( ((strlen(parms->loader) == 0) || + strncmp(parms->loader, "generic", 7)) && +@@ -381,7 +388,7 @@ static elf_errorstatus elf_xen_note_check(struct elf_binary *elf, + { + elf_err(elf, + "ERROR: Will only load images built for the generic loader or Linux images" +- " (Not '%.*s' and '%.*s')\n", ++ " (Not '%.*s' and '%.*s') or with PHYS32_ENTRY set\n", + (int)sizeof(parms->loader), parms->loader, + (int)sizeof(parms->guest_os), parms->guest_os); + return -1; +-- +2.14.3 + + +From 667275050d83fdca61303b09d9c2448f0badf5a9 Mon Sep 17 00:00:00 2001 +From: Jonathan Ludlam +Date: Thu, 11 Jan 2018 17:47:58 +0000 +Subject: [PATCH 17/77] tools/libxc: Multi modules support + +Signed-off-by: Jonathan Ludlam +Signed-off-by: Sergey Dyasli +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- + stubdom/grub/kexec.c | 7 +- + tools/helpers/init-xenstore-domain.c | 4 +- + tools/libxc/include/xc_dom.h | 48 ++++++----- + tools/libxc/xc_dom_compat_linux.c | 2 +- + tools/libxc/xc_dom_core.c | 152 +++++++++++++++++++++++------------ + tools/libxc/xc_dom_x86.c | 65 ++++++++------- + tools/libxl/libxl_dom.c | 10 +-- + 7 files changed, 175 insertions(+), 113 deletions(-) + +diff --git a/stubdom/grub/kexec.c b/stubdom/grub/kexec.c +index 437a0a96e9..61ca082d42 100644 +--- a/stubdom/grub/kexec.c ++++ b/stubdom/grub/kexec.c +@@ -202,7 
+202,7 @@ static void tpm_hash2pcr(struct xc_dom_image *dom, char *cmdline) + ASSERT(rv == 0 && resp->status == 0); + + cmd.pcr = bswap_32(5); // PCR #5 for initrd +- sha1(dom->ramdisk_blob, dom->ramdisk_size, cmd.hash); ++ sha1(dom->modules[0].blob, dom->modules[0].size, cmd.hash); + rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen); + ASSERT(rv == 0 && resp->status == 0); + +@@ -231,13 +231,12 @@ void kexec(void *kernel, long kernel_size, void *module, long module_size, char + + /* We are using guest owned memory, therefore no limits. */ + xc_dom_kernel_max_size(dom, 0); +- xc_dom_ramdisk_max_size(dom, 0); ++ xc_dom_module_max_size(dom, 0); + + dom->kernel_blob = kernel; + dom->kernel_size = kernel_size; + +- dom->ramdisk_blob = module; +- dom->ramdisk_size = module_size; ++ xc_dom_module_mem(dom, module, module_size, NULL); + + dom->flags = flags; + dom->console_evtchn = start_info.console.domU.evtchn; +diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c +index 047ad0cb1d..8453be283b 100644 +--- a/tools/helpers/init-xenstore-domain.c ++++ b/tools/helpers/init-xenstore-domain.c +@@ -145,10 +145,10 @@ static int build(xc_interface *xch) + + if ( ramdisk ) + { +- rv = xc_dom_ramdisk_file(dom, ramdisk); ++ rv = xc_dom_module_file(dom, ramdisk, NULL); + if ( rv ) + { +- fprintf(stderr, "xc_dom_ramdisk_file failed\n"); ++ fprintf(stderr, "xc_dom_module_file failed\n"); + goto err; + } + } +diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h +index cdcdd07d2b..08be8a8f3f 100644 +--- a/tools/libxc/include/xc_dom.h ++++ b/tools/libxc/include/xc_dom.h +@@ -22,6 +22,7 @@ + #define INVALID_PFN ((xen_pfn_t)-1) + #define X86_HVM_NR_SPECIAL_PAGES 8 + #define X86_HVM_END_SPECIAL_REGION 0xff000u ++#define XG_MAX_MODULES 2 + + /* --- typedefs and structs ---------------------------------------- */ + +@@ -56,17 +57,32 @@ struct xc_dom_phys { + xen_pfn_t count; + }; + ++struct xc_dom_module { ++ void 
*blob; ++ size_t size; ++ void *cmdline; ++ /* If seg.vstart is non-zero then the module will be loaded at that ++ * address, otherwise it will be placed automatically. ++ * ++ * If automatic placement is used and the module is gzip ++ * compressed then it will be decompressed as it is loaded. If the ++ * module has been explicitly placed then it is loaded as is, ++ * otherwise decompressing risks undoing the manual placement. ++ */ ++ struct xc_dom_seg seg; ++}; ++ + struct xc_dom_image { + /* files */ + void *kernel_blob; + size_t kernel_size; +- void *ramdisk_blob; +- size_t ramdisk_size; ++ unsigned int num_modules; ++ struct xc_dom_module modules[XG_MAX_MODULES]; + void *devicetree_blob; + size_t devicetree_size; + + size_t max_kernel_size; +- size_t max_ramdisk_size; ++ size_t max_module_size; + size_t max_devicetree_size; + + /* arguments and parameters */ +@@ -80,15 +96,6 @@ struct xc_dom_image { + + /* memory layout */ + struct xc_dom_seg kernel_seg; +- /* If ramdisk_seg.vstart is non zero then the ramdisk will be +- * loaded at that address, otherwise it will automatically placed. +- * +- * If automatic placement is used and the ramdisk is gzip +- * compressed then it will be decompressed as it is loaded. If the +- * ramdisk has been explicitly placed then it is loaded as is +- * otherwise decompressing risks undoing the manual placement. +- */ +- struct xc_dom_seg ramdisk_seg; + struct xc_dom_seg p2m_seg; + struct xc_dom_seg pgtables_seg; + struct xc_dom_seg devicetree_seg; +@@ -277,12 +284,12 @@ void xc_dom_release(struct xc_dom_image *dom); + int xc_dom_rambase_init(struct xc_dom_image *dom, uint64_t rambase); + int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb); + +-/* Set this larger if you have enormous ramdisks/kernels. Note that ++/* Set this larger if you have enormous modules/kernels. Note that + * you should trust all kernels not to be maliciously large (e.g.
to + * exhaust all dom0 memory) if you do this (see CVE-2012-4544 / + * XSA-25). You can also set the default independently for +- * ramdisks/kernels in xc_dom_allocate() or call +- * xc_dom_{kernel,ramdisk}_max_size. ++ * modules/kernels in xc_dom_allocate() or call ++ * xc_dom_{kernel,module}_max_size. + */ + #ifndef XC_DOM_DECOMPRESS_MAX + #define XC_DOM_DECOMPRESS_MAX (1024*1024*1024) /* 1GB */ +@@ -291,8 +298,8 @@ int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb); + int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz); + int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz); + +-int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz); +-int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz); ++int xc_dom_module_check_size(struct xc_dom_image *dom, size_t sz); ++int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz); + + int xc_dom_devicetree_max_size(struct xc_dom_image *dom, size_t sz); + +@@ -303,11 +310,12 @@ int xc_dom_do_gunzip(xc_interface *xch, + int xc_dom_try_gunzip(struct xc_dom_image *dom, void **blob, size_t * size); + + int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename); +-int xc_dom_ramdisk_file(struct xc_dom_image *dom, const char *filename); ++int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, ++ const char *cmdline); + int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, + size_t memsize); +-int xc_dom_ramdisk_mem(struct xc_dom_image *dom, const void *mem, +- size_t memsize); ++int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem, ++ size_t memsize, const char *cmdline); + int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename); + int xc_dom_devicetree_mem(struct xc_dom_image *dom, const void *mem, + size_t memsize); +diff --git a/tools/libxc/xc_dom_compat_linux.c b/tools/libxc/xc_dom_compat_linux.c +index c922c61e90..b3d43feed9 100644 +--- a/tools/libxc/xc_dom_compat_linux.c ++++ 
b/tools/libxc/xc_dom_compat_linux.c +@@ -56,7 +56,7 @@ int xc_linux_build(xc_interface *xch, uint32_t domid, + if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 ) + goto out; + if ( initrd_name && strlen(initrd_name) && +- ((rc = xc_dom_ramdisk_file(dom, initrd_name)) != 0) ) ++ ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) ) + goto out; + + dom->flags |= flags; +diff --git a/tools/libxc/xc_dom_core.c b/tools/libxc/xc_dom_core.c +index 303cb971e8..3e65aff22b 100644 +--- a/tools/libxc/xc_dom_core.c ++++ b/tools/libxc/xc_dom_core.c +@@ -314,16 +314,16 @@ int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz) + return 0; + } + +-int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz) ++int xc_dom_module_check_size(struct xc_dom_image *dom, size_t sz) + { + /* No limit */ +- if ( !dom->max_ramdisk_size ) ++ if ( !dom->max_module_size ) + return 0; + +- if ( sz > dom->max_ramdisk_size ) ++ if ( sz > dom->max_module_size ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, +- "ramdisk image too large"); ++ "module image too large"); + return 1; + } + +@@ -764,7 +764,7 @@ struct xc_dom_image *xc_dom_allocate(xc_interface *xch, + dom->xch = xch; + + dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX; +- dom->max_ramdisk_size = XC_DOM_DECOMPRESS_MAX; ++ dom->max_module_size = XC_DOM_DECOMPRESS_MAX; + dom->max_devicetree_size = XC_DOM_DECOMPRESS_MAX; + + if ( cmdline ) +@@ -797,10 +797,10 @@ int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz) + return 0; + } + +-int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz) ++int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz) + { +- DOMPRINTF("%s: ramdisk_max_size=%zx", __FUNCTION__, sz); +- dom->max_ramdisk_size = sz; ++ DOMPRINTF("%s: module_max_size=%zx", __FUNCTION__, sz); ++ dom->max_module_size = sz; + return 0; + } + +@@ -821,16 +821,30 @@ int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename) + return xc_dom_try_gunzip(dom, &dom->kernel_blob, 
&dom->kernel_size); + } + +-int xc_dom_ramdisk_file(struct xc_dom_image *dom, const char *filename) ++int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, const char *cmdline) + { ++ unsigned int mod = dom->num_modules++; ++ + DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); +- dom->ramdisk_blob = +- xc_dom_malloc_filemap(dom, filename, &dom->ramdisk_size, +- dom->max_ramdisk_size); ++ dom->modules[mod].blob = ++ xc_dom_malloc_filemap(dom, filename, &dom->modules[mod].size, ++ dom->max_module_size); + +- if ( dom->ramdisk_blob == NULL ) ++ if ( dom->modules[mod].blob == NULL ) + return -1; +-// return xc_dom_try_gunzip(dom, &dom->ramdisk_blob, &dom->ramdisk_size); ++ ++ if ( cmdline ) ++ { ++ dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); ++ ++ if ( dom->modules[mod].cmdline == NULL ) ++ return -1; ++ } ++ else ++ { ++ dom->modules[mod].cmdline = NULL; ++ } ++ + return 0; + } + +@@ -859,13 +873,28 @@ int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, size_t memsize) + return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); + } + +-int xc_dom_ramdisk_mem(struct xc_dom_image *dom, const void *mem, +- size_t memsize) ++int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem, ++ size_t memsize, const char *cmdline) + { ++ unsigned int mod = dom->num_modules++; ++ + DOMPRINTF_CALLED(dom->xch); +- dom->ramdisk_blob = (void *)mem; +- dom->ramdisk_size = memsize; +-// return xc_dom_try_gunzip(dom, &dom->ramdisk_blob, &dom->ramdisk_size); ++ ++ dom->modules[mod].blob = (void *)mem; ++ dom->modules[mod].size = memsize; ++ ++ if ( cmdline ) ++ { ++ dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); ++ ++ if ( dom->modules[mod].cmdline == NULL ) ++ return -1; ++ } ++ else ++ { ++ dom->modules[mod].cmdline = NULL; ++ } ++ + return 0; + } + +@@ -990,41 +1019,42 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom) + return 0; + } + +-static int xc_dom_build_ramdisk(struct xc_dom_image *dom) ++static 
int xc_dom_build_module(struct xc_dom_image *dom, unsigned int mod) + { +- size_t unziplen, ramdisklen; +- void *ramdiskmap; ++ size_t unziplen, modulelen; ++ void *modulemap; ++ char name[10]; + +- if ( !dom->ramdisk_seg.vstart ) ++ if ( !dom->modules[mod].seg.vstart ) + { + unziplen = xc_dom_check_gzip(dom->xch, +- dom->ramdisk_blob, dom->ramdisk_size); +- if ( xc_dom_ramdisk_check_size(dom, unziplen) != 0 ) ++ dom->modules[mod].blob, dom->modules[mod].size); ++ if ( xc_dom_module_check_size(dom, unziplen) != 0 ) + unziplen = 0; + } + else + unziplen = 0; + +- ramdisklen = unziplen ? unziplen : dom->ramdisk_size; +- +- if ( xc_dom_alloc_segment(dom, &dom->ramdisk_seg, "ramdisk", +- dom->ramdisk_seg.vstart, ramdisklen) != 0 ) ++ modulelen = unziplen ? unziplen : dom->modules[mod].size; ++ snprintf(name, sizeof(name), "module%u", mod); ++ if ( xc_dom_alloc_segment(dom, &dom->modules[mod].seg, name, ++ dom->modules[mod].seg.vstart, modulelen) != 0 ) + goto err; +- ramdiskmap = xc_dom_seg_to_ptr(dom, &dom->ramdisk_seg); +- if ( ramdiskmap == NULL ) ++ modulemap = xc_dom_seg_to_ptr(dom, &dom->modules[mod].seg); ++ if ( modulemap == NULL ) + { +- DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->ramdisk_seg) => NULL", +- __FUNCTION__); ++ DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->modules[%u].seg) => NULL", ++ __FUNCTION__, mod); + goto err; + } + if ( unziplen ) + { +- if ( xc_dom_do_gunzip(dom->xch, dom->ramdisk_blob, dom->ramdisk_size, +- ramdiskmap, ramdisklen) == -1 ) ++ if ( xc_dom_do_gunzip(dom->xch, dom->modules[mod].blob, dom->modules[mod].size, ++ modulemap, modulelen) == -1 ) + goto err; + } + else +- memcpy(ramdiskmap, dom->ramdisk_blob, dom->ramdisk_size); ++ memcpy(modulemap, dom->modules[mod].blob, dom->modules[mod].size); + + return 0; + +@@ -1131,6 +1161,7 @@ int xc_dom_build_image(struct xc_dom_image *dom) + { + unsigned int page_size; + bool unmapped_initrd; ++ unsigned int mod; + + DOMPRINTF_CALLED(dom->xch); + +@@ -1154,15 +1185,24 @@ int 
xc_dom_build_image(struct xc_dom_image *dom) + if ( dom->kernel_loader->loader(dom) != 0 ) + goto err; + +- /* Don't load ramdisk now if no initial mapping required. */ +- unmapped_initrd = dom->parms.unmapped_initrd && !dom->ramdisk_seg.vstart; +- +- if ( dom->ramdisk_blob && !unmapped_initrd ) ++ /* Don't load ramdisk / other modules now if no initial mapping required. */ ++ for ( mod = 0; mod < dom->num_modules; mod++ ) + { +- if ( xc_dom_build_ramdisk(dom) != 0 ) +- goto err; +- dom->initrd_start = dom->ramdisk_seg.vstart; +- dom->initrd_len = dom->ramdisk_seg.vend - dom->ramdisk_seg.vstart; ++ unmapped_initrd = (dom->parms.unmapped_initrd && ++ !dom->modules[mod].seg.vstart); ++ ++ if ( dom->modules[mod].blob && !unmapped_initrd ) ++ { ++ if ( xc_dom_build_module(dom, mod) != 0 ) ++ goto err; ++ ++ if ( mod == 0 ) ++ { ++ dom->initrd_start = dom->modules[mod].seg.vstart; ++ dom->initrd_len = ++ dom->modules[mod].seg.vend - dom->modules[mod].seg.vstart; ++ } ++ } + } + + /* load devicetree */ +@@ -1216,14 +1256,24 @@ int xc_dom_build_image(struct xc_dom_image *dom) + if ( dom->virt_pgtab_end && xc_dom_alloc_pad(dom, dom->virt_pgtab_end) ) + return -1; + +- /* Load ramdisk if no initial mapping required. */ +- if ( dom->ramdisk_blob && unmapped_initrd ) ++ for ( mod = 0; mod < dom->num_modules; mod++ ) + { +- if ( xc_dom_build_ramdisk(dom) != 0 ) +- goto err; +- dom->flags |= SIF_MOD_START_PFN; +- dom->initrd_start = dom->ramdisk_seg.pfn; +- dom->initrd_len = page_size * dom->ramdisk_seg.pages; ++ unmapped_initrd = (dom->parms.unmapped_initrd && ++ !dom->modules[mod].seg.vstart); ++ ++ /* Load ramdisk / other modules if no initial mapping required. 
*/ ++ if ( dom->modules[mod].blob && unmapped_initrd ) ++ { ++ if ( xc_dom_build_module(dom, mod) != 0 ) ++ goto err; ++ ++ if ( mod == 0 ) ++ { ++ dom->flags |= SIF_MOD_START_PFN; ++ dom->initrd_start = dom->modules[mod].seg.pfn; ++ dom->initrd_len = page_size * dom->modules[mod].seg.pages; ++ } ++ } + } + + /* Allocate p2m list if outside of initial kernel mapping. */ +diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c +index bff68a011f..0b65dab4bc 100644 +--- a/tools/libxc/xc_dom_x86.c ++++ b/tools/libxc/xc_dom_x86.c +@@ -70,8 +70,8 @@ + #define round_up(addr, mask) ((addr) | (mask)) + #define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1)) + +-#define HVMLOADER_MODULE_MAX_COUNT 1 +-#define HVMLOADER_MODULE_NAME_SIZE 10 ++#define HVMLOADER_MODULE_MAX_COUNT 2 ++#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE + + struct xc_dom_params { + unsigned levels; +@@ -627,6 +627,12 @@ static int alloc_magic_pages_hvm(struct xc_dom_image *dom) + xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN, + special_pfn(SPECIALPAGE_SHARING)); + ++ start_info_size += ++ sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; ++ ++ start_info_size += ++ HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT; ++ + if ( !dom->device_model ) + { + if ( dom->cmdline ) +@@ -634,22 +640,9 @@ static int alloc_magic_pages_hvm(struct xc_dom_image *dom) + dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8); + start_info_size += dom->cmdline_size; + } +- +- /* Limited to one module. */ +- if ( dom->ramdisk_blob ) +- start_info_size += sizeof(struct hvm_modlist_entry); + } + else + { +- start_info_size += +- sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; +- /* +- * Add extra space to write modules name. +- * The HVMLOADER_MODULE_NAME_SIZE accounts for NUL byte. +- */ +- start_info_size += +- HVMLOADER_MODULE_NAME_SIZE * HVMLOADER_MODULE_MAX_COUNT; +- + /* + * Allocate and clear additional ioreq server pages. 
The default + * server will use the IOREQ and BUFIOREQ special pages above. +@@ -749,7 +742,7 @@ static int start_info_x86_32(struct xc_dom_image *dom) + start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); + start_info->console.domU.evtchn = dom->console_evtchn; + +- if ( dom->ramdisk_blob ) ++ if ( dom->modules[0].blob ) + { + start_info->mod_start = dom->initrd_start; + start_info->mod_len = dom->initrd_len; +@@ -800,7 +793,7 @@ static int start_info_x86_64(struct xc_dom_image *dom) + start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); + start_info->console.domU.evtchn = dom->console_evtchn; + +- if ( dom->ramdisk_blob ) ++ if ( dom->modules[0].blob ) + { + start_info->mod_start = dom->initrd_start; + start_info->mod_len = dom->initrd_len; +@@ -1237,7 +1230,7 @@ static int meminit_hvm(struct xc_dom_image *dom) + unsigned long target_pages = dom->target_pages; + unsigned long cur_pages, cur_pfn; + int rc; +- unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, ++ unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, + stat_1gb_pages = 0; + unsigned int memflags = 0; + int claim_enabled = dom->claim_enabled; +@@ -1303,6 +1296,8 @@ static int meminit_hvm(struct xc_dom_image *dom) + p2m_size = 0; + for ( i = 0; i < nr_vmemranges; i++ ) + { ++ DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64, vmemranges[i].start, vmemranges[i].end); ++ + total_pages += ((vmemranges[i].end - vmemranges[i].start) + >> PAGE_SHIFT); + p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ? 
+@@ -1633,7 +1628,7 @@ static int alloc_pgtables_hvm(struct xc_dom_image *dom) + */ + static void add_module_to_list(struct xc_dom_image *dom, + struct xc_hvm_firmware_module *module, +- const char *name, ++ const char *cmdline, + struct hvm_modlist_entry *modlist, + struct hvm_start_info *start_info) + { +@@ -1648,16 +1643,20 @@ static void add_module_to_list(struct xc_dom_image *dom, + return; + + assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT); +- assert(strnlen(name, HVMLOADER_MODULE_NAME_SIZE) +- < HVMLOADER_MODULE_NAME_SIZE); + + modlist[index].paddr = module->guest_addr_out; + modlist[index].size = module->length; + +- strncpy(modules_cmdline_start + HVMLOADER_MODULE_NAME_SIZE * index, +- name, HVMLOADER_MODULE_NAME_SIZE); ++ if ( cmdline ) ++ { ++ assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE) ++ < HVMLOADER_MODULE_CMDLINE_SIZE); ++ strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index, ++ cmdline, HVMLOADER_MODULE_CMDLINE_SIZE); ++ } ++ + modlist[index].cmdline_paddr = +- modules_cmdline_paddr + HVMLOADER_MODULE_NAME_SIZE * index; ++ modules_cmdline_paddr + HVMLOADER_MODULE_CMDLINE_SIZE * index; + + start_info->nr_modules++; + } +@@ -1669,10 +1668,10 @@ static int bootlate_hvm(struct xc_dom_image *dom) + struct hvm_start_info *start_info; + size_t start_info_size; + struct hvm_modlist_entry *modlist; ++ unsigned int i; + + start_info_size = sizeof(*start_info) + dom->cmdline_size; +- if ( dom->ramdisk_blob ) +- start_info_size += sizeof(struct hvm_modlist_entry); ++ start_info_size += sizeof(struct hvm_modlist_entry) * dom->num_modules; + + if ( start_info_size > + dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom) ) +@@ -1703,12 +1702,18 @@ static int bootlate_hvm(struct xc_dom_image *dom) + ((uintptr_t)cmdline - (uintptr_t)start_info); + } + +- if ( dom->ramdisk_blob ) ++ for ( i = 0; i < dom->num_modules; i++ ) + { ++ struct xc_hvm_firmware_module mod; ++ ++ DOMPRINTF("Adding module %u", i); ++ mod.guest_addr_out = 
++ dom->modules[i].seg.vstart - dom->parms.virt_base; ++ mod.length = ++ dom->modules[i].seg.vend - dom->modules[i].seg.vstart; + +- modlist[0].paddr = dom->ramdisk_seg.vstart - dom->parms.virt_base; +- modlist[0].size = dom->ramdisk_seg.vend - dom->ramdisk_seg.vstart; +- start_info->nr_modules = 1; ++ add_module_to_list(dom, &mod, dom->modules[i].cmdline, ++ modlist, start_info); + } + + /* ACPI module 0 is the RSDP */ +diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c +index ef834e652d..fbbdb9ec2f 100644 +--- a/tools/libxl/libxl_dom.c ++++ b/tools/libxl/libxl_dom.c +@@ -796,12 +796,12 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid, + + if ( state->pv_ramdisk.path && strlen(state->pv_ramdisk.path) ) { + if (state->pv_ramdisk.mapped) { +- if ( (ret = xc_dom_ramdisk_mem(dom, state->pv_ramdisk.data, state->pv_ramdisk.size)) != 0 ) { ++ if ( (ret = xc_dom_module_mem(dom, state->pv_ramdisk.data, state->pv_ramdisk.size, NULL)) != 0 ) { + LOGE(ERROR, "xc_dom_ramdisk_mem failed"); + goto out; + } + } else { +- if ( (ret = xc_dom_ramdisk_file(dom, state->pv_ramdisk.path)) != 0 ) { ++ if ( (ret = xc_dom_module_file(dom, state->pv_ramdisk.path, NULL)) != 0 ) { + LOGE(ERROR, "xc_dom_ramdisk_file failed"); + goto out; + } +@@ -1043,14 +1043,14 @@ static int libxl__domain_firmware(libxl__gc *gc, + + if (state->pv_ramdisk.path && strlen(state->pv_ramdisk.path)) { + if (state->pv_ramdisk.mapped) { +- rc = xc_dom_ramdisk_mem(dom, state->pv_ramdisk.data, +- state->pv_ramdisk.size); ++ rc = xc_dom_module_mem(dom, state->pv_ramdisk.data, ++ state->pv_ramdisk.size, NULL); + if (rc) { + LOGE(ERROR, "xc_dom_ramdisk_mem failed"); + goto out; + } + } else { +- rc = xc_dom_ramdisk_file(dom, state->pv_ramdisk.path); ++ rc = xc_dom_module_file(dom, state->pv_ramdisk.path, NULL); + if (rc) { + LOGE(ERROR, "xc_dom_ramdisk_file failed"); + goto out; +-- +2.14.3 + + +From 78e9cc3488ffd55131b129a3ab90169d4e903efe Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 
2018 17:47:58 +0000 +Subject: [PATCH 18/77] xen/common: Widen the guest logging buffer slightly + +This reduces the amount of line wrapping from guests; Xen in particular likes +to print lines longer than 80 characters. + +Signed-off-by: Andrew Cooper +Reviewed-by: Wei Liu +--- + xen/include/xen/sched.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 002ba29d6d..64abc1df6c 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -427,7 +427,7 @@ struct domain + xen_domain_handle_t handle; + + /* hvm_print_line() and guest_console_write() logging. */ +-#define DOMAIN_PBUF_SIZE 80 ++#define DOMAIN_PBUF_SIZE 200 + char *pbuf; + unsigned pbuf_idx; + spinlock_t pbuf_lock; +-- +2.14.3 + + +From 92a6295c30a9f323de9d741e2e43f49df4412308 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 19/77] x86/time: Print a more helpful error when a platform + timer can't be found + +Signed-off-by: Andrew Cooper +Reviewed-by: Wei Liu +--- + xen/arch/x86/time.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c +index eba7aed72d..6c20b1036d 100644 +--- a/xen/arch/x86/time.c ++++ b/xen/arch/x86/time.c +@@ -708,7 +708,8 @@ static u64 __init init_platform_timer(void) + } + } + +- BUG_ON(rc <= 0); ++ if ( rc <= 0 ) ++ panic("Unable to find usable platform timer"); + + printk("Platform timer is %s %s\n", + freq_string(pts->frequency), pts->name); +-- +2.14.3 + + +From ff1fb8fe53bb91823a1a37b6dd0e816d519c19d8 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 20/77] x86/link: Introduce and use SECTION_ALIGN + +... to reduce the quantity of #ifdef EFI. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Wei Liu +--- +CC: Jan Beulich +--- + xen/arch/x86/xen.lds.S | 50 +++++++++++++------------------------------------- + 1 file changed, 13 insertions(+), 37 deletions(-) + +diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S +index d5e8821d41..6164ad094f 100644 +--- a/xen/arch/x86/xen.lds.S ++++ b/xen/arch/x86/xen.lds.S +@@ -12,12 +12,14 @@ + #define FORMAT "pei-x86-64" + #undef __XEN_VIRT_START + #define __XEN_VIRT_START __image_base__ ++#define SECTION_ALIGN MB(2) + + ENTRY(efi_start) + + #else /* !EFI */ + + #define FORMAT "elf64-x86-64" ++#define SECTION_ALIGN PAGE_SIZE + + ENTRY(start) + +@@ -67,11 +69,7 @@ SECTIONS + _etext = .; /* End of text section */ + } :text = 0x9090 + +-#ifdef EFI +- . = ALIGN(MB(2)); +-#else +- . = ALIGN(PAGE_SIZE); +-#endif ++ . = ALIGN(SECTION_ALIGN); + __2M_text_end = .; + + __2M_rodata_start = .; /* Start of 2M superpages, mapped RO. */ +@@ -149,11 +147,7 @@ SECTIONS + #endif + _erodata = .; + +-#ifdef EFI +- . = ALIGN(MB(2)); +-#else +- . = ALIGN(PAGE_SIZE); +-#endif ++ . = ALIGN(SECTION_ALIGN); + __2M_rodata_end = .; + + __2M_init_start = .; /* Start of 2M superpages, mapped RWX (boot only). */ +@@ -215,11 +209,7 @@ SECTIONS + __ctors_end = .; + } :text + +-#ifdef EFI +- . = ALIGN(MB(2)); +-#else +- . = ALIGN(PAGE_SIZE); +-#endif ++ . = ALIGN(SECTION_ALIGN); + __init_end = .; + __2M_init_end = .; + +@@ -257,11 +247,7 @@ SECTIONS + } :text + _end = . ; + +-#ifdef EFI +- . = ALIGN(MB(2)); +-#else +- . = ALIGN(PAGE_SIZE); +-#endif ++ . 
= ALIGN(SECTION_ALIGN); + __2M_rwdata_end = .; + + #ifdef EFI +@@ -310,23 +296,13 @@ ASSERT(__image_base__ > XEN_VIRT_START || + ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large") + #endif + +-#ifdef EFI +-ASSERT(IS_ALIGNED(__2M_text_end, MB(2)), "__2M_text_end misaligned") +-ASSERT(IS_ALIGNED(__2M_rodata_start, MB(2)), "__2M_rodata_start misaligned") +-ASSERT(IS_ALIGNED(__2M_rodata_end, MB(2)), "__2M_rodata_end misaligned") +-ASSERT(IS_ALIGNED(__2M_init_start, MB(2)), "__2M_init_start misaligned") +-ASSERT(IS_ALIGNED(__2M_init_end, MB(2)), "__2M_init_end misaligned") +-ASSERT(IS_ALIGNED(__2M_rwdata_start, MB(2)), "__2M_rwdata_start misaligned") +-ASSERT(IS_ALIGNED(__2M_rwdata_end, MB(2)), "__2M_rwdata_end misaligned") +-#else +-ASSERT(IS_ALIGNED(__2M_text_end, PAGE_SIZE), "__2M_text_end misaligned") +-ASSERT(IS_ALIGNED(__2M_rodata_start, PAGE_SIZE), "__2M_rodata_start misaligned") +-ASSERT(IS_ALIGNED(__2M_rodata_end, PAGE_SIZE), "__2M_rodata_end misaligned") +-ASSERT(IS_ALIGNED(__2M_init_start, PAGE_SIZE), "__2M_init_start misaligned") +-ASSERT(IS_ALIGNED(__2M_init_end, PAGE_SIZE), "__2M_init_end misaligned") +-ASSERT(IS_ALIGNED(__2M_rwdata_start, PAGE_SIZE), "__2M_rwdata_start misaligned") +-ASSERT(IS_ALIGNED(__2M_rwdata_end, PAGE_SIZE), "__2M_rwdata_end misaligned") +-#endif ++ASSERT(IS_ALIGNED(__2M_text_end, SECTION_ALIGN), "__2M_text_end misaligned") ++ASSERT(IS_ALIGNED(__2M_rodata_start, SECTION_ALIGN), "__2M_rodata_start misaligned") ++ASSERT(IS_ALIGNED(__2M_rodata_end, SECTION_ALIGN), "__2M_rodata_end misaligned") ++ASSERT(IS_ALIGNED(__2M_init_start, SECTION_ALIGN), "__2M_init_start misaligned") ++ASSERT(IS_ALIGNED(__2M_init_end, SECTION_ALIGN), "__2M_init_end misaligned") ++ASSERT(IS_ALIGNED(__2M_rwdata_start, SECTION_ALIGN), "__2M_rwdata_start misaligned") ++ASSERT(IS_ALIGNED(__2M_rwdata_end, SECTION_ALIGN), "__2M_rwdata_end misaligned") + + ASSERT(IS_ALIGNED(cpu0_stack, STACK_SIZE), "cpu0_stack misaligned") + +-- +2.14.3 + + 
+From 9e46ae12edc8be1dd846ce545600db28dabfabc8 Mon Sep 17 00:00:00 2001 +From: Bob Moore +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 21/77] ACPICA: Make ACPI Power Management Timer (PM Timer) + optional. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +PM Timer is now optional. +This support is already in Windows8 and "SHOULD" come out in ACPI 5.0A +(if all goes well). + +The change doesn't affect Xen directly, because it does not rely +on the presence of the PM timer. + +Signed-off-by: Bob Moore +Signed-off-by: Lv Zheng +Signed-off-by: Rafael J. Wysocki +[ported to Xen] +Signed-off-by: Roger Pau Monné +--- + xen/drivers/acpi/tables/tbfadt.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xen/drivers/acpi/tables/tbfadt.c b/xen/drivers/acpi/tables/tbfadt.c +index d62d8d5cb9..f11fd5a900 100644 +--- a/xen/drivers/acpi/tables/tbfadt.c ++++ b/xen/drivers/acpi/tables/tbfadt.c +@@ -95,7 +95,8 @@ static struct acpi_fadt_info __initdata fadt_info_table[] = { + + {"PmTimerBlock", ACPI_FADT_OFFSET(xpm_timer_block), + ACPI_FADT_OFFSET(pm_timer_block), +- ACPI_FADT_OFFSET(pm_timer_length), ACPI_FADT_REQUIRED}, ++ ACPI_FADT_OFFSET(pm_timer_length), ++ ACPI_FADT_SEPARATE_LENGTH}, /* ACPI 5.0A: Timer is optional */ + + {"Gpe0Block", ACPI_FADT_OFFSET(xgpe0_block), + ACPI_FADT_OFFSET(gpe0_block), +@@ -437,7 +438,7 @@ static void __init acpi_tb_validate_fadt(void) + + if (fadt_info_table[i].type & ACPI_FADT_REQUIRED) { + /* +- * Field is required (Pm1a_event, Pm1a_control, pm_timer). ++ * Field is required (Pm1a_event, Pm1a_control). + * Both the address and length must be non-zero. 
+ */ + if (!address64->address || !length) { +-- +2.14.3 + + +From e7c8187b91fbff4c15e2cba06e33a1dce4b0b55e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 22/77] xen/domctl: Return arch_config via getdomaininfo + +This allows toolstack software to distinguish HVM from PVH guests. + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +Reviewed-by: Jan Beulich +--- +v2: bump domctl version number +--- + tools/libxc/include/xenctrl.h | 1 + + tools/libxc/xc_domain.c | 1 + + xen/arch/x86/domctl.c | 2 ++ + xen/include/public/domctl.h | 3 ++- + 4 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h +index 666db0b919..a92a8d7a53 100644 +--- a/tools/libxc/include/xenctrl.h ++++ b/tools/libxc/include/xenctrl.h +@@ -456,6 +456,7 @@ typedef struct xc_dominfo { + unsigned int max_vcpu_id; + xen_domain_handle_t handle; + unsigned int cpupool; ++ struct xen_arch_domainconfig arch_config; + } xc_dominfo_t; + + typedef xen_domctl_getdomaininfo_t xc_domaininfo_t; +diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c +index 3ccd27f101..8169284dc1 100644 +--- a/tools/libxc/xc_domain.c ++++ b/tools/libxc/xc_domain.c +@@ -421,6 +421,7 @@ int xc_domain_getinfo(xc_interface *xch, + info->nr_online_vcpus = domctl.u.getdomaininfo.nr_online_vcpus; + info->max_vcpu_id = domctl.u.getdomaininfo.max_vcpu_id; + info->cpupool = domctl.u.getdomaininfo.cpupool; ++ info->arch_config = domctl.u.getdomaininfo.arch_config; + + memcpy(info->handle, domctl.u.getdomaininfo.handle, + sizeof(xen_domain_handle_t)); +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index 075ee92cd7..b52d6d9552 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -345,6 +345,8 @@ void arch_get_domain_info(const struct domain *d, + { + if ( paging_mode_hap(d) ) + info->flags |= XEN_DOMINF_hap; ++ ++ info->arch_config.emulation_flags = d->arch.emulation_flags; + } + + 
#define MAX_IOPORTS 0x10000 +diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h +index 70027abc00..463f8cc420 100644 +--- a/xen/include/public/domctl.h ++++ b/xen/include/public/domctl.h +@@ -38,7 +38,7 @@ + #include "hvm/save.h" + #include "memory.h" + +-#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000e ++#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000f + + /* + * NB. xen_domctl.domain is an IN/OUT parameter for this operation. +@@ -116,6 +116,7 @@ struct xen_domctl_getdomaininfo { + uint32_t ssidref; + xen_domain_handle_t handle; + uint32_t cpupool; ++ struct xen_arch_domainconfig arch_config; + }; + typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t; + DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t); +-- +2.14.3 + + +From 78898c9d1b5bffe141da923bf4b5b19cc388e260 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 23/77] tools/ocaml: Expose arch_config in domaininfo + +Signed-off-by: Andrew Cooper +--- + tools/ocaml/libs/xc/xenctrl.ml | 29 +++++++++++++++++++++++++++++ + tools/ocaml/libs/xc/xenctrl.mli | 28 ++++++++++++++++++++++++++++ + tools/ocaml/libs/xc/xenctrl_stubs.c | 26 ++++++++++++++++++++++++-- + 3 files changed, 81 insertions(+), 2 deletions(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl.ml b/tools/ocaml/libs/xc/xenctrl.ml +index 70a325b0e9..d549068d60 100644 +--- a/tools/ocaml/libs/xc/xenctrl.ml ++++ b/tools/ocaml/libs/xc/xenctrl.ml +@@ -28,6 +28,34 @@ type vcpuinfo = + cpumap: int32; + } + ++type xen_arm_arch_domainconfig = ++{ ++ gic_version: int; ++ nr_spis: int; ++ clock_frequency: int32; ++} ++ ++type x86_arch_emulation_flags = ++ | X86_EMU_LAPIC ++ | X86_EMU_HPET ++ | X86_EMU_PM ++ | X86_EMU_RTC ++ | X86_EMU_IOAPIC ++ | X86_EMU_PIC ++ | X86_EMU_VGA ++ | X86_EMU_IOMMU ++ | X86_EMU_PIT ++ | X86_EMU_USE_PIRQ ++ ++type xen_x86_arch_domainconfig = ++{ ++ emulation_flags: x86_arch_emulation_flags list; ++} ++ ++type arch_domainconfig = ++ | ARM of 
xen_arm_arch_domainconfig ++ | X86 of xen_x86_arch_domainconfig ++ + type domaininfo = + { + domid : domid; +@@ -46,6 +74,7 @@ type domaininfo = + max_vcpu_id : int; + ssidref : int32; + handle : int array; ++ arch_config : arch_domainconfig; + } + + type sched_control = +diff --git a/tools/ocaml/libs/xc/xenctrl.mli b/tools/ocaml/libs/xc/xenctrl.mli +index 702d8a7ab8..08f1fd26ae 100644 +--- a/tools/ocaml/libs/xc/xenctrl.mli ++++ b/tools/ocaml/libs/xc/xenctrl.mli +@@ -22,6 +22,33 @@ type vcpuinfo = { + cputime : int64; + cpumap : int32; + } ++ ++type xen_arm_arch_domainconfig = { ++ gic_version: int; ++ nr_spis: int; ++ clock_frequency: int32; ++} ++ ++type x86_arch_emulation_flags = ++ | X86_EMU_LAPIC ++ | X86_EMU_HPET ++ | X86_EMU_PM ++ | X86_EMU_RTC ++ | X86_EMU_IOAPIC ++ | X86_EMU_PIC ++ | X86_EMU_VGA ++ | X86_EMU_IOMMU ++ | X86_EMU_PIT ++ | X86_EMU_USE_PIRQ ++ ++type xen_x86_arch_domainconfig = { ++ emulation_flags: x86_arch_emulation_flags list; ++} ++ ++type arch_domainconfig = ++ | ARM of xen_arm_arch_domainconfig ++ | X86 of xen_x86_arch_domainconfig ++ + type domaininfo = { + domid : domid; + dying : bool; +@@ -39,6 +66,7 @@ type domaininfo = { + max_vcpu_id : int; + ssidref : int32; + handle : int array; ++ arch_config : arch_domainconfig; + } + type sched_control = { weight : int; cap : int; } + type physinfo_cap_flag = CAP_HVM | CAP_DirectIO +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index c66732f67c..124aa34fe8 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -273,10 +273,10 @@ CAMLprim value stub_xc_domain_shutdown(value xch, value domid, value reason) + static value alloc_domaininfo(xc_domaininfo_t * info) + { + CAMLparam0(); +- CAMLlocal2(result, tmp); ++ CAMLlocal5(result, tmp, arch_config, x86_arch_config, emul_list); + int i; + +- result = caml_alloc_tuple(16); ++ result = caml_alloc_tuple(17); + + Store_field(result, 0, Val_int(info->domain)); + 
Store_field(result, 1, Val_bool(info->flags & XEN_DOMINF_dying)); +@@ -302,6 +302,28 @@ static value alloc_domaininfo(xc_domaininfo_t * info) + + Store_field(result, 15, tmp); + ++ /* emulation_flags: x86_arch_emulation_flags list; */ ++ tmp = emul_list = Val_emptylist; ++ for (i = 0; i < 10; i++) { ++ if ((info->arch_config.emulation_flags >> i) & 1) { ++ tmp = caml_alloc_small(2, Tag_cons); ++ Field(tmp, 0) = Val_int(i); ++ Field(tmp, 1) = emul_list; ++ emul_list = tmp; ++ } ++ } ++ ++ /* xen_x86_arch_domainconfig */ ++ x86_arch_config = caml_alloc_tuple(1); ++ Store_field(x86_arch_config, 0, emul_list); ++ ++ /* arch_config: arch_domainconfig */ ++ arch_config = caml_alloc_small(1, 1); ++ ++ Store_field(arch_config, 0, x86_arch_config); ++ ++ Store_field(result, 16, arch_config); ++ + CAMLreturn(result); + } + +-- +2.14.3 + + +From 48811d481cedd5838a2d0ba8dfa149133888c84b Mon Sep 17 00:00:00 2001 +From: Jon Ludlam +Date: Thu, 11 Jan 2018 17:47:59 +0000 +Subject: [PATCH 24/77] tools/ocaml: Extend domain_create() to take + arch_domainconfig + +No longer passing NULL into xc_domain_create() allows for the creation +of PVH guests. 
+ +Signed-off-by: Jon Ludlam +Signed-off-by: Andrew Cooper +--- + tools/ocaml/libs/xc/xenctrl.ml | 2 +- + tools/ocaml/libs/xc/xenctrl.mli | 2 +- + tools/ocaml/libs/xc/xenctrl_stubs.c | 22 ++++++++++++++++++++-- + 3 files changed, 22 insertions(+), 4 deletions(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl.ml b/tools/ocaml/libs/xc/xenctrl.ml +index d549068d60..9116aa222c 100644 +--- a/tools/ocaml/libs/xc/xenctrl.ml ++++ b/tools/ocaml/libs/xc/xenctrl.ml +@@ -143,7 +143,7 @@ let with_intf f = + interface_close xc; + r + +-external _domain_create: handle -> int32 -> domain_create_flag list -> int array -> domid ++external _domain_create: handle -> int32 -> domain_create_flag list -> int array -> arch_domainconfig -> domid + = "stub_xc_domain_create" + + let int_array_of_uuid_string s = +diff --git a/tools/ocaml/libs/xc/xenctrl.mli b/tools/ocaml/libs/xc/xenctrl.mli +index 08f1fd26ae..54c099c88f 100644 +--- a/tools/ocaml/libs/xc/xenctrl.mli ++++ b/tools/ocaml/libs/xc/xenctrl.mli +@@ -102,7 +102,7 @@ external sizeof_xen_pfn : unit -> int = "stub_sizeof_xen_pfn" + external interface_open : unit -> handle = "stub_xc_interface_open" + external interface_close : handle -> unit = "stub_xc_interface_close" + val with_intf : (handle -> 'a) -> 'a +-val domain_create : handle -> int32 -> domain_create_flag list -> string -> domid ++val domain_create : handle -> int32 -> domain_create_flag list -> string -> arch_domainconfig -> domid + val domain_sethandle : handle -> domid -> string -> unit + external domain_max_vcpus : handle -> domid -> int -> unit + = "stub_xc_domain_max_vcpus" +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index 124aa34fe8..0b5a2361c0 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -144,7 +144,8 @@ static int domain_create_flag_table[] = { + }; + + CAMLprim value stub_xc_domain_create(value xch, value ssidref, +- value flags, value handle) ++ value flags, value handle, 
++ value domconfig) + { + CAMLparam4(xch, ssidref, flags, handle); + +@@ -155,6 +156,7 @@ CAMLprim value stub_xc_domain_create(value xch, value ssidref, + uint32_t c_ssidref = Int32_val(ssidref); + unsigned int c_flags = 0; + value l; ++ xc_domain_configuration_t config = {}; + + if (Wosize_val(handle) != 16) + caml_invalid_argument("Handle not a 16-integer array"); +@@ -168,8 +170,24 @@ CAMLprim value stub_xc_domain_create(value xch, value ssidref, + c_flags |= domain_create_flag_table[v]; + } + ++ switch(Tag_val(domconfig)) { ++ case 0: /* ARM - nothing to do */ ++ caml_failwith("Unhandled: ARM"); ++ break; ++ ++ case 1: /* X86 - emulation flags in the block */ ++ for (l = Field(Field(domconfig, 0), 0); ++ l != Val_none; ++ l = Field(l, 1)) ++ config.emulation_flags |= 1u << Int_val(Field(l, 0)); ++ break; ++ ++ default: ++ caml_failwith("Unhandled domconfig type"); ++ } ++ + caml_enter_blocking_section(); +- result = xc_domain_create(_H(xch), c_ssidref, h, c_flags, &domid, NULL); ++ result = xc_domain_create(_H(xch), c_ssidref, h, c_flags, &domid, &config); + caml_leave_blocking_section(); + + if (result < 0) +-- +2.14.3 + + +From 57dc22b80d3ba6db7eea87d84a009015e65eefb0 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:48:00 +0000 +Subject: [PATCH 25/77] x86/fixmap: Modify fix_to_virt() to return a void + pointer + +Almost all users of fix_to_virt() actually want a pointer. Include the cast +within the definition, so the callers don't need to. + +Two users which need the integer value are switched to using __fix_to_virt() +directly. A few users stay fully unchanged, due to GCC's void pointer +arithmetic extension causing the same behaviour. Most users however have +their explicit casting dropped. + +Since __iomem is not used consistently in Xen, we drop it too. + +No functional change. + +Signed-off-by: Andrew Cooper +Reviewed-by: Wei Liu +Signed-off-by: Wei Liu +--- +v2: update commit message and remove unnecessary parentheses. 
+--- + xen/arch/x86/acpi/lib.c | 2 +- + xen/arch/x86/mm.c | 4 ++-- + xen/arch/x86/mpparse.c | 2 +- + xen/arch/x86/msi.c | 3 +-- + xen/arch/x86/tboot.c | 4 ++-- + xen/drivers/acpi/apei/apei-io.c | 2 +- + xen/drivers/char/ehci-dbgp.c | 2 +- + xen/drivers/char/ns16550.c | 2 +- + xen/include/asm-x86/apicdef.h | 2 +- + xen/include/asm-x86/fixmap.h | 2 +- + 10 files changed, 12 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/acpi/lib.c b/xen/arch/x86/acpi/lib.c +index 7d7c71848b..265b9ad819 100644 +--- a/xen/arch/x86/acpi/lib.c ++++ b/xen/arch/x86/acpi/lib.c +@@ -49,7 +49,7 @@ char *__acpi_map_table(paddr_t phys, unsigned long size) + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; + set_fixmap(FIX_ACPI_END, phys); +- base = fix_to_virt(FIX_ACPI_END); ++ base = __fix_to_virt(FIX_ACPI_END); + + /* + * Most cases can be covered by the below. +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a7a76a71db..0569342200 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5205,12 +5205,12 @@ void __set_fixmap( + enum fixed_addresses idx, unsigned long mfn, unsigned long flags) + { + BUG_ON(idx >= __end_of_fixed_addresses); +- map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags); ++ map_pages_to_xen(__fix_to_virt(idx), mfn, 1, flags); + } + + void *__init arch_vmap_virt_end(void) + { +- return (void *)fix_to_virt(__end_of_fixed_addresses); ++ return fix_to_virt(__end_of_fixed_addresses); + } + + void __iomem *ioremap(paddr_t pa, size_t len) +diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c +index a1a0738a19..49140e46f0 100644 +--- a/xen/arch/x86/mpparse.c ++++ b/xen/arch/x86/mpparse.c +@@ -703,7 +703,7 @@ static void __init efi_check_config(void) + return; + + __set_fixmap(FIX_EFI_MPF, PFN_DOWN(efi.mps), __PAGE_HYPERVISOR); +- mpf = (void *)fix_to_virt(FIX_EFI_MPF) + ((long)efi.mps & (PAGE_SIZE-1)); ++ mpf = fix_to_virt(FIX_EFI_MPF) + ((long)efi.mps & (PAGE_SIZE-1)); + + if (memcmp(mpf->mpf_signature, "_MP_", 4) == 0 && + 
mpf->mpf_length == 1 && +diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c +index 4652b98c2d..475881ed89 100644 +--- a/xen/arch/x86/msi.c ++++ b/xen/arch/x86/msi.c +@@ -961,8 +961,7 @@ static int msix_capability_init(struct pci_dev *dev, + xfree(entry); + return idx; + } +- base = (void *)(fix_to_virt(idx) + +- ((unsigned long)entry_paddr & (PAGE_SIZE - 1))); ++ base = fix_to_virt(idx) + (entry_paddr & (PAGE_SIZE - 1)); + + /* Mask interrupt here */ + writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); +diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c +index 59d7c477f4..d36bf33407 100644 +--- a/xen/arch/x86/tboot.c ++++ b/xen/arch/x86/tboot.c +@@ -82,7 +82,7 @@ static void __init tboot_copy_memory(unsigned char *va, uint32_t size, + { + map_base = PFN_DOWN(pa + i); + set_fixmap(FIX_TBOOT_MAP_ADDRESS, map_base << PAGE_SHIFT); +- map_addr = (unsigned char *)fix_to_virt(FIX_TBOOT_MAP_ADDRESS); ++ map_addr = fix_to_virt(FIX_TBOOT_MAP_ADDRESS); + } + va[i] = map_addr[pa + i - (map_base << PAGE_SHIFT)]; + } +@@ -98,7 +98,7 @@ void __init tboot_probe(void) + + /* Map and check for tboot UUID. 
*/ + set_fixmap(FIX_TBOOT_SHARED_BASE, opt_tboot_pa); +- tboot_shared = (tboot_shared_t *)fix_to_virt(FIX_TBOOT_SHARED_BASE); ++ tboot_shared = fix_to_virt(FIX_TBOOT_SHARED_BASE); + if ( tboot_shared == NULL ) + return; + if ( memcmp(&tboot_shared_uuid, (uuid_t *)tboot_shared, sizeof(uuid_t)) ) +diff --git a/xen/drivers/acpi/apei/apei-io.c b/xen/drivers/acpi/apei/apei-io.c +index 8955de935e..89b70f45ef 100644 +--- a/xen/drivers/acpi/apei/apei-io.c ++++ b/xen/drivers/acpi/apei/apei-io.c +@@ -92,7 +92,7 @@ static void __iomem *__init apei_range_map(paddr_t paddr, unsigned long size) + apei_range_nr++; + } + +- return (void __iomem *)fix_to_virt(FIX_APEI_RANGE_BASE + start_nr); ++ return fix_to_virt(FIX_APEI_RANGE_BASE + start_nr); + } + + /* +diff --git a/xen/drivers/char/ehci-dbgp.c b/xen/drivers/char/ehci-dbgp.c +index d48e777c34..d0071d3114 100644 +--- a/xen/drivers/char/ehci-dbgp.c ++++ b/xen/drivers/char/ehci-dbgp.c +@@ -1327,7 +1327,7 @@ static void __init ehci_dbgp_init_preirq(struct serial_port *port) + * than enough. 1k is the biggest that was seen. 
+ */ + set_fixmap_nocache(FIX_EHCI_DBGP, dbgp->bar_val); +- ehci_bar = (void __iomem *)fix_to_virt(FIX_EHCI_DBGP); ++ ehci_bar = fix_to_virt(FIX_EHCI_DBGP); + ehci_bar += dbgp->bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + +diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c +index e0f8199f98..f32dbd3247 100644 +--- a/xen/drivers/char/ns16550.c ++++ b/xen/drivers/char/ns16550.c +@@ -697,7 +697,7 @@ static void __init ns16550_init_preirq(struct serial_port *port) + enum fixed_addresses idx = FIX_COM_BEGIN + (uart - ns16550_com); + + set_fixmap_nocache(idx, uart->io_base); +- uart->remapped_io_base = (void __iomem *)fix_to_virt(idx); ++ uart->remapped_io_base = fix_to_virt(idx); + uart->remapped_io_base += uart->io_base & ~PAGE_MASK; + #else + uart->remapped_io_base = (char *)ioremap(uart->io_base, uart->io_size); +diff --git a/xen/include/asm-x86/apicdef.h b/xen/include/asm-x86/apicdef.h +index eed504a31a..2fa0b77a8a 100644 +--- a/xen/include/asm-x86/apicdef.h ++++ b/xen/include/asm-x86/apicdef.h +@@ -119,7 +119,7 @@ + /* Only available in x2APIC mode */ + #define APIC_SELF_IPI 0x3F0 + +-#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) ++#define APIC_BASE __fix_to_virt(FIX_APIC_BASE) + + /* It's only used in x2APIC mode of an x2APIC unit. 
*/ + #define APIC_MSR_BASE 0x800 +diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h +index 89bf6cb611..51b0e7e945 100644 +--- a/xen/include/asm-x86/fixmap.h ++++ b/xen/include/asm-x86/fixmap.h +@@ -79,7 +79,7 @@ extern void __set_fixmap( + #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) + #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +-#define fix_to_virt(x) (__fix_to_virt(x)) ++#define fix_to_virt(x) ((void *)__fix_to_virt(x)) + + static inline unsigned long virt_to_fix(const unsigned long vaddr) + { +-- +2.14.3 + + +From b538a13a68b42dbe47832d76299011765bf59e60 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 11 Jan 2018 17:48:00 +0000 +Subject: [PATCH 26/77] x86: Common cpuid faulting support + +With CPUID Faulting offered to SVM guests, move Xen's faulting code to being +common rather than Intel specific. + +This is necessary for nested Xen (inc. pv-shim mode) to prevent PV guests from +finding the outer HVM Xen leaves via native cpuid. + +Signed-off-by: Andrew Cooper +--- + xen/arch/x86/cpu/amd.c | 16 +++++--- + xen/arch/x86/cpu/common.c | 76 ++++++++++++++++++++++++++++++++++++-- + xen/arch/x86/cpu/intel.c | 82 +++++++---------------------------------- + xen/include/asm-x86/cpuid.h | 3 -- + xen/include/asm-x86/processor.h | 4 +- + 5 files changed, 98 insertions(+), 83 deletions(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 5f36ac75a7..2bff3ee377 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -198,11 +198,12 @@ static void __init noinline probe_masking_msrs(void) + } + + /* +- * Context switch levelling state to the next domain. A parameter of NULL is +- * used to context switch to the default host state (by the cpu bringup-code, +- * crash path, etc). ++ * Context switch CPUID masking state to the next domain. Only called if ++ * CPUID Faulting isn't available, but masking MSRs have been detected. 
A ++ * parameter of NULL is used to context switch to the default host state (by ++ * the cpu bringup-code, crash path, etc). + */ +-static void amd_ctxt_switch_levelling(const struct vcpu *next) ++static void amd_ctxt_switch_masking(const struct vcpu *next) + { + struct cpuidmasks *these_masks = &this_cpu(cpuidmasks); + const struct domain *nextd = next ? next->domain : NULL; +@@ -263,6 +264,9 @@ static void __init noinline amd_init_levelling(void) + { + const struct cpuidmask *m = NULL; + ++ if (probe_cpuid_faulting()) ++ return; ++ + probe_masking_msrs(); + + if (*opt_famrev != '\0') { +@@ -352,7 +356,7 @@ static void __init noinline amd_init_levelling(void) + } + + if (levelling_caps) +- ctxt_switch_levelling = amd_ctxt_switch_levelling; ++ ctxt_switch_masking = amd_ctxt_switch_masking; + } + + /* +@@ -518,7 +522,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) + if (c == &boot_cpu_data) + amd_init_levelling(); + +- amd_ctxt_switch_levelling(NULL); ++ ctxt_switch_levelling(NULL); + } + + static void init_amd(struct cpuinfo_x86 *c) +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index 6cf362849e..157bae2026 100644 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -113,12 +113,80 @@ static const struct cpu_dev default_cpu = { + }; + static const struct cpu_dev *this_cpu = &default_cpu; + +-static void default_ctxt_switch_levelling(const struct vcpu *next) ++static DEFINE_PER_CPU(uint64_t, msr_misc_features); ++void (* __read_mostly ctxt_switch_masking)(const struct vcpu *next); ++ ++bool __init probe_cpuid_faulting(void) ++{ ++ uint64_t val; ++ ++ if (rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) || ++ !(val & MSR_PLATFORM_INFO_CPUID_FAULTING) || ++ rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, ++ this_cpu(msr_misc_features))) ++ { ++ setup_clear_cpu_cap(X86_FEATURE_CPUID_FAULTING); ++ return false; ++ } ++ ++ expected_levelling_cap |= LCAP_faulting; ++ levelling_caps |= LCAP_faulting; ++ 
setup_force_cpu_cap(X86_FEATURE_CPUID_FAULTING); ++ ++ return true; ++} ++ ++static void set_cpuid_faulting(bool enable) ++{ ++ uint64_t *this_misc_features = &this_cpu(msr_misc_features); ++ uint64_t val = *this_misc_features; ++ ++ if (!!(val & MSR_MISC_FEATURES_CPUID_FAULTING) == enable) ++ return; ++ ++ val ^= MSR_MISC_FEATURES_CPUID_FAULTING; ++ ++ wrmsrl(MSR_INTEL_MISC_FEATURES_ENABLES, val); ++ *this_misc_features = val; ++} ++ ++void ctxt_switch_levelling(const struct vcpu *next) + { +- /* Nop */ ++ const struct domain *nextd = next ? next->domain : NULL; ++ ++ if (cpu_has_cpuid_faulting) { ++ /* ++ * No need to alter the faulting setting if we are switching ++ * to idle; it won't affect any code running in idle context. ++ */ ++ if (nextd && is_idle_domain(nextd)) ++ return; ++ /* ++ * We *should* be enabling faulting for the control domain. ++ * ++ * Unfortunately, the domain builder (having only ever been a ++ * PV guest) expects to be able to see host cpuid state in a ++ * native CPUID instruction, to correctly build a CPUID policy ++ * for HVM guests (notably the xstate leaves). ++ * ++ * This logic is fundimentally broken for HVM toolstack ++ * domains, and faulting causes PV guests to behave like HVM ++ * guests from their point of view. ++ * ++ * Future development plans will move responsibility for ++ * generating the maximum full cpuid policy into Xen, at which ++ * this problem will disappear. 
++ */ ++ set_cpuid_faulting(nextd && !is_control_domain(nextd) && ++ (is_pv_domain(nextd) || ++ next->arch.msr-> ++ misc_features_enables.cpuid_faulting)); ++ return; ++ } ++ ++ if (ctxt_switch_masking) ++ ctxt_switch_masking(next); + } +-void (* __read_mostly ctxt_switch_levelling)(const struct vcpu *next) = +- default_ctxt_switch_levelling; + + bool_t opt_cpu_info; + boolean_param("cpuinfo", opt_cpu_info); +diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c +index ac932e5b38..508e56f5c1 100644 +--- a/xen/arch/x86/cpu/intel.c ++++ b/xen/arch/x86/cpu/intel.c +@@ -17,41 +17,6 @@ + + #define select_idle_routine(x) ((void)0) + +-static bool __init probe_intel_cpuid_faulting(void) +-{ +- uint64_t x; +- +- if (rdmsr_safe(MSR_INTEL_PLATFORM_INFO, x) || +- !(x & MSR_PLATFORM_INFO_CPUID_FAULTING)) +- return 0; +- +- expected_levelling_cap |= LCAP_faulting; +- levelling_caps |= LCAP_faulting; +- setup_force_cpu_cap(X86_FEATURE_CPUID_FAULTING); +- return 1; +-} +- +-DEFINE_PER_CPU(bool, cpuid_faulting_enabled); +- +-static void set_cpuid_faulting(bool enable) +-{ +- bool *this_enabled = &this_cpu(cpuid_faulting_enabled); +- uint32_t hi, lo; +- +- ASSERT(cpu_has_cpuid_faulting); +- +- if (*this_enabled == enable) +- return; +- +- rdmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi); +- lo &= ~MSR_MISC_FEATURES_CPUID_FAULTING; +- if (enable) +- lo |= MSR_MISC_FEATURES_CPUID_FAULTING; +- wrmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi); +- +- *this_enabled = enable; +-} +- + /* + * Set caps in expected_levelling_cap, probe a specific masking MSR, and set + * caps in levelling_caps if it is found, or clobber the MSR index if missing. +@@ -147,40 +112,17 @@ static void __init probe_masking_msrs(void) + } + + /* +- * Context switch levelling state to the next domain. A parameter of NULL is +- * used to context switch to the default host state (by the cpu bringup-code, +- * crash path, etc). ++ * Context switch CPUID masking state to the next domain. 
Only called if ++ * CPUID Faulting isn't available, but masking MSRs have been detected. A ++ * parameter of NULL is used to context switch to the default host state (by ++ * the cpu bringup-code, crash path, etc). + */ +-static void intel_ctxt_switch_levelling(const struct vcpu *next) ++static void intel_ctxt_switch_masking(const struct vcpu *next) + { + struct cpuidmasks *these_masks = &this_cpu(cpuidmasks); + const struct domain *nextd = next ? next->domain : NULL; +- const struct cpuidmasks *masks; +- +- if (cpu_has_cpuid_faulting) { +- /* +- * We *should* be enabling faulting for the control domain. +- * +- * Unfortunately, the domain builder (having only ever been a +- * PV guest) expects to be able to see host cpuid state in a +- * native CPUID instruction, to correctly build a CPUID policy +- * for HVM guests (notably the xstate leaves). +- * +- * This logic is fundimentally broken for HVM toolstack +- * domains, and faulting causes PV guests to behave like HVM +- * guests from their point of view. +- * +- * Future development plans will move responsibility for +- * generating the maximum full cpuid policy into Xen, at which +- * this problem will disappear. +- */ +- set_cpuid_faulting(nextd && !is_control_domain(nextd) && +- (is_pv_domain(nextd) || +- next->arch.msr->misc_features_enables.cpuid_faulting)); +- return; +- } +- +- masks = (nextd && is_pv_domain(nextd) && nextd->arch.pv_domain.cpuidmasks) ++ const struct cpuidmasks *masks = ++ (nextd && is_pv_domain(nextd) && nextd->arch.pv_domain.cpuidmasks) + ? 
nextd->arch.pv_domain.cpuidmasks : &cpuidmask_defaults; + + if (msr_basic) { +@@ -225,8 +167,10 @@ static void intel_ctxt_switch_levelling(const struct vcpu *next) + */ + static void __init noinline intel_init_levelling(void) + { +- if (!probe_intel_cpuid_faulting()) +- probe_masking_msrs(); ++ if (probe_cpuid_faulting()) ++ return; ++ ++ probe_masking_msrs(); + + if (msr_basic) { + uint32_t ecx, edx, tmp; +@@ -280,7 +224,7 @@ static void __init noinline intel_init_levelling(void) + } + + if (levelling_caps) +- ctxt_switch_levelling = intel_ctxt_switch_levelling; ++ ctxt_switch_masking = intel_ctxt_switch_masking; + } + + static void early_init_intel(struct cpuinfo_x86 *c) +@@ -320,7 +264,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) + if (c == &boot_cpu_data) + intel_init_levelling(); + +- intel_ctxt_switch_levelling(NULL); ++ ctxt_switch_levelling(NULL); + } + + /* +diff --git a/xen/include/asm-x86/cpuid.h b/xen/include/asm-x86/cpuid.h +index d2dd841e15..74d6f123e5 100644 +--- a/xen/include/asm-x86/cpuid.h ++++ b/xen/include/asm-x86/cpuid.h +@@ -58,9 +58,6 @@ DECLARE_PER_CPU(struct cpuidmasks, cpuidmasks); + /* Default masking MSR values, calculated at boot. */ + extern struct cpuidmasks cpuidmask_defaults; + +-/* Whether or not cpuid faulting is available for the current domain. 
*/ +-DECLARE_PER_CPU(bool, cpuid_faulting_enabled); +- + #define CPUID_GUEST_NR_BASIC (0xdu + 1) + #define CPUID_GUEST_NR_FEAT (0u + 1) + #define CPUID_GUEST_NR_CACHE (5u + 1) +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 41a8d8c32f..c9601b2fb2 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -151,7 +151,9 @@ extern struct cpuinfo_x86 boot_cpu_data; + extern struct cpuinfo_x86 cpu_data[]; + #define current_cpu_data cpu_data[smp_processor_id()] + +-extern void (*ctxt_switch_levelling)(const struct vcpu *next); ++extern bool probe_cpuid_faulting(void); ++extern void ctxt_switch_levelling(const struct vcpu *next); ++extern void (*ctxt_switch_masking)(const struct vcpu *next); + + extern u64 host_pat; + extern bool_t opt_cpu_info; +-- +2.14.3 + + +From af2f50b2b6f284a5498bcfe8e4203b25e120338e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 10 Nov 2017 16:35:26 +0000 +Subject: [PATCH 27/77] x86/Kconfig: Options for Xen and PVH support + +Introduce two options. One to detect whether the binary is running on +Xen, the other enables PVH ABI support. + +The former will be useful to PV in HVM approach. Both will be used by +PV in PVH approach. + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- +v2: +Write commit message. Didn't change the config option value as it +requires a lot of changes in later patches. +--- + xen/arch/x86/Kconfig | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index 7c4582922f..c0b0bcdcb3 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -117,6 +117,23 @@ config TBOOT + Technology (TXT) + + If unsure, say Y. ++ ++config XEN_GUEST ++ def_bool n ++ prompt "Xen Guest" ++ ---help--- ++ Support for Xen detecting when it is running under Xen. ++ ++ If unsure, say N. 
++ ++config PVH_GUEST ++ def_bool n ++ prompt "PVH Guest" ++ depends on XEN_GUEST ++ ---help--- ++ Support booting using the PVH ABI. ++ ++ If unsure, say N. + endmenu + + source "common/Kconfig" +-- +2.14.3 + + +From f575701f3c7a6c6afde7c289058d9d3110a617d1 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 22 Nov 2017 11:09:41 +0000 +Subject: [PATCH 28/77] x86/link: Relocate program headers + +When the xen binary is loaded by libelf (in the future) we rely on the +elf loader to load the binary accordingly. Specify the load address so +that the resulting binary can make p_vaddr and p_paddr have different +values. + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- +v2: +Clarify commit message. Haven't tested grub1 boot. +--- + xen/arch/x86/xen.lds.S | 22 +++++++++++++--------- + 1 file changed, 13 insertions(+), 9 deletions(-) + +diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S +index 6164ad094f..400d8a56c4 100644 +--- a/xen/arch/x86/xen.lds.S ++++ b/xen/arch/x86/xen.lds.S +@@ -13,6 +13,7 @@ + #undef __XEN_VIRT_START + #define __XEN_VIRT_START __image_base__ + #define SECTION_ALIGN MB(2) ++#define DECL_SECTION(x) x : + + ENTRY(efi_start) + +@@ -20,8 +21,9 @@ ENTRY(efi_start) + + #define FORMAT "elf64-x86-64" + #define SECTION_ALIGN PAGE_SIZE ++#define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START) + +-ENTRY(start) ++ENTRY(start_pa) + + #endif /* EFI */ + +@@ -56,9 +58,11 @@ SECTIONS + __2M_text_start = .; /* Start of 2M superpages, mapped RX. */ + #endif + ++ start_pa = ABSOLUTE(start - __XEN_VIRT_START); ++ + . = __XEN_VIRT_START + XEN_IMG_OFFSET; + _start = .; +- .text : { ++ DECL_SECTION(.text) { + _stext = .; /* Text and read-only data */ + *(.text) + *(.text.cold) +@@ -73,7 +77,7 @@ SECTIONS + __2M_text_end = .; + + __2M_rodata_start = .; /* Start of 2M superpages, mapped RO. 
*/ +- .rodata : { ++ DECL_SECTION(.rodata) { + _srodata = .; + /* Bug frames table */ + __start_bug_frames = .; +@@ -132,13 +136,13 @@ SECTIONS + * compiler may want to inject other things in the .note which we don't care + * about - hence this unique name. + */ +- .note.gnu.build-id : { ++ DECL_SECTION(.note.gnu.build-id) { + __note_gnu_build_id_start = .; + *(.note.gnu.build-id) + __note_gnu_build_id_end = .; + } :note :text + #elif defined(BUILD_ID_EFI) +- .buildid : { ++ DECL_SECTION(.buildid) { + __note_gnu_build_id_start = .; + *(.buildid) + __note_gnu_build_id_end = .; +@@ -153,7 +157,7 @@ SECTIONS + __2M_init_start = .; /* Start of 2M superpages, mapped RWX (boot only). */ + . = ALIGN(PAGE_SIZE); /* Init code and data */ + __init_begin = .; +- .init : { ++ DECL_SECTION(.init) { + _sinittext = .; + *(.init.text) + /* +@@ -215,7 +219,7 @@ SECTIONS + + __2M_rwdata_start = .; /* Start of 2M superpages, mapped RW. */ + . = ALIGN(SMP_CACHE_BYTES); +- .data.read_mostly : { ++ DECL_SECTION(.data.read_mostly) { + *(.data.read_mostly) + . = ALIGN(8); + __start_schedulers_array = .; +@@ -223,7 +227,7 @@ SECTIONS + __end_schedulers_array = .; + } :text + +- .data : { /* Data */ ++ DECL_SECTION(.data) { + *(.data.page_aligned) + *(.data) + *(.data.rel) +@@ -231,7 +235,7 @@ SECTIONS + CONSTRUCTORS + } :text + +- .bss : { /* BSS */ ++ DECL_SECTION(.bss) { + __bss_start = .; + *(.bss.stack_aligned) + *(.bss.page_aligned*) +-- +2.14.3 + + +From 887c705600114c502cd3b529659af085680f526a Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Fri, 10 Nov 2017 12:36:49 +0000 +Subject: [PATCH 29/77] x86: introduce ELFNOTE macro + +It is needed later for introducing PVH entry point. + +Signed-off-by: Wei Liu +--- +v2: +1. Specify section attribute and type. +2. Use p2align. +3. Align instructions. +4. Haven't used .L or turned it into assembly macro. 
+--- + xen/include/asm-x86/asm_defns.h | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h +index 388fc93b9d..35a5d9ee03 100644 +--- a/xen/include/asm-x86/asm_defns.h ++++ b/xen/include/asm-x86/asm_defns.h +@@ -409,4 +409,16 @@ static always_inline void stac(void) + #define REX64_PREFIX "rex64/" + #endif + ++#define ELFNOTE(name, type, desc) \ ++ .pushsection .note.name, "a", @note ; \ ++ .p2align 2 ; \ ++ .long 2f - 1f /* namesz */ ; \ ++ .long 4f - 3f /* descsz */ ; \ ++ .long type /* type */ ; \ ++1: .asciz #name /* name */ ; \ ++2: .p2align 2 ; \ ++3: desc /* desc */ ; \ ++4: .p2align 2 ; \ ++ .popsection ++ + #endif /* __X86_ASM_DEFNS_H__ */ +-- +2.14.3 + + +From 51f937a39bb6acadec1f4ab55f01048c2c1caee0 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Fri, 10 Nov 2017 16:19:40 +0000 +Subject: [PATCH 30/77] x86: produce a binary that can be booted as PVH + +Produce a binary that can be booted as PVH. It doesn't do much yet. + +Signed-off-by: Wei Liu +Signed-off-by: Andrew Cooper +--- +v2: +1. Remove shim-y dependency. +2. Remove extraneous blank line. +3. Fix bugs in xen.lds.S. +4. Haven't split code into pvh.S because that will break later + patches. +--- + .gitignore | 1 + + xen/arch/x86/Makefile | 8 ++++++++ + xen/arch/x86/boot/head.S | 9 +++++++++ + xen/arch/x86/xen.lds.S | 9 ++++++++- + 4 files changed, 26 insertions(+), 1 deletion(-) + +diff --git a/.gitignore b/.gitignore +index d64b03d06c..8da67daf31 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -323,6 +323,7 @@ xen/xsm/flask/xenpolicy-* + tools/flask/policy/policy.conf + tools/flask/policy/xenpolicy-* + xen/xen ++xen/xen-shim + xen/xen-syms + xen/xen-syms.map + xen/xen.* +diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile +index d5d58a205e..01d1178530 100644 +--- a/xen/arch/x86/Makefile ++++ b/xen/arch/x86/Makefile +@@ -75,6 +75,8 @@ efi-y := $(shell if [ ! 
-r $(BASEDIR)/include/xen/compile.h -o \ + -O $(BASEDIR)/include/xen/compile.h ]; then \ + echo '$(TARGET).efi'; fi) + ++shim-$(CONFIG_PVH_GUEST) := $(TARGET)-shim ++ + ifneq ($(build_id_linker),) + notes_phdrs = --notes + else +@@ -144,6 +146,11 @@ $(TARGET)-syms: prelink.o xen.lds $(BASEDIR)/common/symbols-dummy.o + >$(@D)/$(@F).map + rm -f $(@D)/.$(@F).[0-9]* + ++# Use elf32-x86-64 if toolchain support exists, elf32-i386 otherwise. ++$(TARGET)-shim: FORMAT = $(firstword $(filter elf32-x86-64,$(shell $(OBJCOPY) --help)) elf32-i386) ++$(TARGET)-shim: $(TARGET)-syms ++ $(OBJCOPY) -O $(FORMAT) $< $@ ++ + note.o: $(TARGET)-syms + $(OBJCOPY) -O binary --only-section=.note.gnu.build-id $(BASEDIR)/xen-syms $@.bin + $(OBJCOPY) -I binary -O elf64-x86-64 -B i386:x86-64 \ +@@ -224,5 +231,6 @@ clean:: + rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32 + rm -f $(BASEDIR)/.xen-syms.[0-9]* boot/.*.d + rm -f $(BASEDIR)/.xen.efi.[0-9]* efi/*.efi efi/disabled efi/mkreloc ++ rm -f $(BASEDIR)/xen-shim + rm -f boot/cmdline.S boot/reloc.S boot/*.lnk boot/*.bin + rm -f note.o +diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S +index 9cc35da558..af25d23736 100644 +--- a/xen/arch/x86/boot/head.S ++++ b/xen/arch/x86/boot/head.S +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + + .text + .code32 +@@ -374,6 +375,14 @@ cs32_switch: + /* Jump to earlier loaded address. 
*/ + jmp *%edi + ++#ifdef CONFIG_PVH_GUEST ++ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, .long sym_offs(__pvh_start)) ++ ++__pvh_start: ++ ud2a ++ ++#endif /* CONFIG_PVH_GUEST */ ++ + __start: + cld + cli +diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S +index 400d8a56c4..2023f971e4 100644 +--- a/xen/arch/x86/xen.lds.S ++++ b/xen/arch/x86/xen.lds.S +@@ -34,7 +34,7 @@ OUTPUT_ARCH(i386:x86-64) + PHDRS + { + text PT_LOAD ; +-#if defined(BUILD_ID) && !defined(EFI) ++#if (defined(BUILD_ID) || defined (CONFIG_PVH_GUEST)) && !defined(EFI) + note PT_NOTE ; + #endif + } +@@ -128,6 +128,12 @@ SECTIONS + __param_end = .; + } :text + ++#if defined(CONFIG_PVH_GUEST) && !defined(EFI) ++ DECL_SECTION(.note.Xen) { ++ *(.note.Xen) ++ } :note :text ++#endif ++ + #if defined(BUILD_ID) + #if !defined(EFI) + /* +@@ -279,6 +285,7 @@ SECTIONS + #ifdef EFI + *(.comment) + *(.comment.*) ++ *(.note.Xen) + #endif + } + +-- +2.14.3 + + +From db65173fe73568d0c718ce2a1c3ef8dc69c66b99 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Mon, 13 Nov 2017 17:32:19 +0000 +Subject: [PATCH 31/77] x86/entry: Early PVH boot code + +Signed-off-by: Wei Liu +Signed-off-by: Andrew Cooper +--- +v2: +1. Fix comment. +2. Use cmpb $0. +3. Address comments on pvh-boot.c. +4. Haven't changed the printk modifiers to accommodate future changes. +5. Missing a prerequisite patch to relocate pvh_info to make __va work reliably. + [BLOCKER].
+--- + xen/arch/x86/Makefile | 1 + + xen/arch/x86/boot/head.S | 40 +++++++++++- + xen/arch/x86/boot/x86_64.S | 2 +- + xen/arch/x86/guest/Makefile | 1 + + xen/arch/x86/guest/pvh-boot.c | 119 +++++++++++++++++++++++++++++++++++ + xen/arch/x86/setup.c | 18 +++++- + xen/include/asm-x86/guest.h | 34 ++++++++++ + xen/include/asm-x86/guest/pvh-boot.h | 57 +++++++++++++++++ + 8 files changed, 268 insertions(+), 4 deletions(-) + create mode 100644 xen/arch/x86/guest/Makefile + create mode 100644 xen/arch/x86/guest/pvh-boot.c + create mode 100644 xen/include/asm-x86/guest.h + create mode 100644 xen/include/asm-x86/guest/pvh-boot.h + +diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile +index 01d1178530..ac91e13606 100644 +--- a/xen/arch/x86/Makefile ++++ b/xen/arch/x86/Makefile +@@ -1,6 +1,7 @@ + subdir-y += acpi + subdir-y += cpu + subdir-y += genapic ++subdir-$(CONFIG_XEN_GUEST) += guest + subdir-$(CONFIG_HVM) += hvm + subdir-y += mm + subdir-$(CONFIG_XENOPROF) += oprofile +diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S +index af25d23736..14caca6798 100644 +--- a/xen/arch/x86/boot/head.S ++++ b/xen/arch/x86/boot/head.S +@@ -379,7 +379,39 @@ cs32_switch: + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, .long sym_offs(__pvh_start)) + + __pvh_start: +- ud2a ++ cld ++ cli ++ ++ /* ++ * We need one push/pop to determine load address. Use the same ++ * absolute stack address as the native path, for lack of a better ++ * alternative. ++ */ ++ mov $0x1000, %esp ++ ++ /* Calculate the load base address. */ ++ call 1f ++1: pop %esi ++ sub $sym_offs(1b), %esi ++ ++ /* Set up stack. 
*/ ++ lea STACK_SIZE + sym_esi(cpu0_stack), %esp ++ ++ mov %ebx, sym_esi(pvh_start_info_pa) ++ ++ /* Prepare gdt and segments */ ++ add %esi, sym_esi(gdt_boot_base) ++ lgdt sym_esi(gdt_boot_descr) ++ ++ mov $BOOT_DS, %ecx ++ mov %ecx, %ds ++ mov %ecx, %es ++ mov %ecx, %ss ++ ++ /* Skip bootloader setup and bios setup, go straight to trampoline */ ++ movb $1, sym_esi(pvh_boot) ++ movb $1, sym_esi(skip_realmode) ++ jmp trampoline_setup + + #endif /* CONFIG_PVH_GUEST */ + +@@ -543,12 +575,18 @@ trampoline_setup: + /* Get bottom-most low-memory stack address. */ + add $TRAMPOLINE_SPACE,%ecx + ++#ifdef CONFIG_PVH_GUEST ++ cmpb $0, sym_fs(pvh_boot) ++ jne 1f ++#endif ++ + /* Save the Multiboot info struct (after relocation) for later use. */ + push %ecx /* Bottom-most low-memory stack address. */ + push %ebx /* Multiboot information address. */ + push %eax /* Multiboot magic. */ + call reloc + mov %eax,sym_fs(multiboot_ptr) ++1: + + /* + * Now trampoline_phys points to the following structure (lowest address +diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S +index 925fd4bb0a..cf47e019f5 100644 +--- a/xen/arch/x86/boot/x86_64.S ++++ b/xen/arch/x86/boot/x86_64.S +@@ -31,7 +31,7 @@ ENTRY(__high_start) + test %ebx,%ebx + jnz start_secondary + +- /* Pass off the Multiboot info structure to C land. */ ++ /* Pass off the Multiboot info structure to C land (if applicable). */ + mov multiboot_ptr(%rip),%edi + call __start_xen + BUG /* __start_xen() shouldn't return. 
*/ +diff --git a/xen/arch/x86/guest/Makefile b/xen/arch/x86/guest/Makefile +new file mode 100644 +index 0000000000..a5f1625ab1 +--- /dev/null ++++ b/xen/arch/x86/guest/Makefile +@@ -0,0 +1 @@ ++obj-bin-$(CONFIG_PVH_GUEST) += pvh-boot.init.o +diff --git a/xen/arch/x86/guest/pvh-boot.c b/xen/arch/x86/guest/pvh-boot.c +new file mode 100644 +index 0000000000..186e332657 +--- /dev/null ++++ b/xen/arch/x86/guest/pvh-boot.c +@@ -0,0 +1,119 @@ ++/****************************************************************************** ++ * arch/x86/guest/pvh-boot.c ++ * ++ * PVH boot time support ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. ++ */ ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++/* Initialised in head.S, before .bss is zeroed. */ ++bool __initdata pvh_boot; ++uint32_t __initdata pvh_start_info_pa; ++ ++static multiboot_info_t __initdata pvh_mbi; ++static module_t __initdata pvh_mbi_mods[8]; ++static const char *__initdata pvh_loader = "PVH Directboot"; ++ ++static void __init convert_pvh_info(void) ++{ ++ const struct hvm_start_info *pvh_info = __va(pvh_start_info_pa); ++ const struct hvm_modlist_entry *entry; ++ module_t *mod; ++ unsigned int i; ++ ++ ASSERT(pvh_info->magic == XEN_HVM_START_MAGIC_VALUE); ++ ++ /* ++ * Turn hvm_start_info into mbi. Luckily all modules are placed under 4GB ++ * boundary on x86. 
++ */ ++ pvh_mbi.flags = MBI_CMDLINE | MBI_MODULES | MBI_LOADERNAME; ++ ++ BUG_ON(pvh_info->cmdline_paddr >> 32); ++ pvh_mbi.cmdline = pvh_info->cmdline_paddr; ++ pvh_mbi.boot_loader_name = __pa(pvh_loader); ++ ++ BUG_ON(pvh_info->nr_modules >= ARRAY_SIZE(pvh_mbi_mods)); ++ pvh_mbi.mods_count = pvh_info->nr_modules; ++ pvh_mbi.mods_addr = __pa(pvh_mbi_mods); ++ ++ mod = pvh_mbi_mods; ++ entry = __va(pvh_info->modlist_paddr); ++ for ( i = 0; i < pvh_info->nr_modules; i++ ) ++ { ++ BUG_ON(entry[i].paddr >> 32); ++ BUG_ON(entry[i].cmdline_paddr >> 32); ++ ++ mod[i].mod_start = entry[i].paddr; ++ mod[i].mod_end = entry[i].paddr + entry[i].size; ++ mod[i].string = entry[i].cmdline_paddr; ++ } ++} ++ ++multiboot_info_t *__init pvh_init(void) ++{ ++ convert_pvh_info(); ++ ++ return &pvh_mbi; ++} ++ ++void __init pvh_print_info(void) ++{ ++ const struct hvm_start_info *pvh_info = __va(pvh_start_info_pa); ++ const struct hvm_modlist_entry *entry; ++ unsigned int i; ++ ++ ASSERT(pvh_info->magic == XEN_HVM_START_MAGIC_VALUE); ++ ++ printk("PVH start info: (pa %08x)\n", pvh_start_info_pa); ++ printk(" version: %u\n", pvh_info->version); ++ printk(" flags: %#"PRIx32"\n", pvh_info->flags); ++ printk(" nr_modules: %u\n", pvh_info->nr_modules); ++ printk(" modlist_pa: %016"PRIx64"\n", pvh_info->modlist_paddr); ++ printk(" cmdline_pa: %016"PRIx64"\n", pvh_info->cmdline_paddr); ++ if ( pvh_info->cmdline_paddr ) ++ printk(" cmdline: '%s'\n", (char *)__va(pvh_info->cmdline_paddr)); ++ printk(" rsdp_pa: %016"PRIx64"\n", pvh_info->rsdp_paddr); ++ ++ entry = __va(pvh_info->modlist_paddr); ++ for ( i = 0; i < pvh_info->nr_modules; i++ ) ++ { ++ printk(" mod[%u].pa: %016"PRIx64"\n", i, entry[i].paddr); ++ printk(" mod[%u].size: %016"PRIu64"\n", i, entry[i].size); ++ printk(" mod[%u].cmdline_pa: %016"PRIx64"\n", ++ i, entry[i].cmdline_paddr); ++ if ( entry[i].cmdline_paddr ) ++ printk(" mod[%1u].cmdline: '%s'\n", i, ++ (char *)__va(entry[i].cmdline_paddr)); ++ } ++} ++ ++/* ++ * Local 
variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 2e10c6bdf4..4b8d09b751 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -51,6 +51,7 @@ + #include + #include + #include ++#include + + /* opt_nosmp: If true, secondary processors are ignored. */ + static bool __initdata opt_nosmp; +@@ -649,8 +650,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + char *memmap_type = NULL; + char *cmdline, *kextra, *loader; + unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity; +- multiboot_info_t *mbi = __va(mbi_p); +- module_t *mod = (module_t *)__va(mbi->mods_addr); ++ multiboot_info_t *mbi; ++ module_t *mod; + unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; + int i, j, e820_warn = 0, bytes = 0; + bool acpi_boot_table_init_done = false, relocated = false; +@@ -680,6 +681,16 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + /* Full exception support from here on in. */ + ++ if ( pvh_boot ) ++ { ++ ASSERT(mbi_p == 0); ++ mbi = pvh_init(); ++ } ++ else ++ mbi = __va(mbi_p); ++ ++ mod = __va(mbi->mods_addr); ++ + loader = (mbi->flags & MBI_LOADERNAME) + ? 
(char *)__va(mbi->boot_loader_name) : "unknown"; + +@@ -719,6 +730,9 @@ void __init noreturn __start_xen(unsigned long mbi_p) + ehci_dbgp_init(); + console_init_preirq(); + ++ if ( pvh_boot ) ++ pvh_print_info(); ++ + printk("Bootloader: %s\n", loader); + + printk("Command line: %s\n", cmdline); +diff --git a/xen/include/asm-x86/guest.h b/xen/include/asm-x86/guest.h +new file mode 100644 +index 0000000000..630c092c25 +--- /dev/null ++++ b/xen/include/asm-x86/guest.h +@@ -0,0 +1,34 @@ ++/****************************************************************************** ++ * asm-x86/guest.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms and conditions of the GNU General Public ++ * License, version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++ ++#ifndef __X86_GUEST_H__ ++#define __X86_GUEST_H__ ++ ++#include ++ ++#endif /* __X86_GUEST_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/guest/pvh-boot.h b/xen/include/asm-x86/guest/pvh-boot.h +new file mode 100644 +index 0000000000..1b429f9401 +--- /dev/null ++++ b/xen/include/asm-x86/guest/pvh-boot.h +@@ -0,0 +1,57 @@ ++/****************************************************************************** ++ * asm-x86/guest/pvh-boot.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms and conditions of the GNU General Public ++ * License, version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++ ++#ifndef __X86_PVH_BOOT_H__ ++#define __X86_PVH_BOOT_H__ ++ ++#include ++ ++#ifdef CONFIG_PVH_GUEST ++ ++extern bool pvh_boot; ++ ++multiboot_info_t *pvh_init(void); ++void pvh_print_info(void); ++ ++#else ++ ++#define pvh_boot 0 ++ ++static inline multiboot_info_t *pvh_init(void) ++{ ++ ASSERT_UNREACHABLE(); ++ return NULL; ++} ++ ++static inline void pvh_print_info(void) ++{ ++ ASSERT_UNREACHABLE(); ++} ++ ++#endif /* CONFIG_PVH_GUEST */ ++#endif /* __X86_PVH_BOOT_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From 31b664a93f5efd8f40889d04028881c18b76a5a3 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 22 Nov 2017 11:39:04 +0000 +Subject: [PATCH 32/77] x86/boot: Map more than the first 16MB + +TODO: Replace somehow (bootstrap_map() ?) + +Signed-off-by: Andrew Cooper +--- + xen/arch/x86/boot/x86_64.S | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S +index cf47e019f5..42636cf334 100644 +--- a/xen/arch/x86/boot/x86_64.S ++++ b/xen/arch/x86/boot/x86_64.S +@@ -114,11 +114,10 @@ GLOBAL(__page_tables_start) + GLOBAL(l2_identmap) + .quad sym_offs(l1_identmap) + __PAGE_HYPERVISOR + idx = 1 +- .rept 7 ++ .rept 4 * L2_PAGETABLE_ENTRIES - 1 + .quad (idx << L2_PAGETABLE_SHIFT) | PAGE_HYPERVISOR | _PAGE_PSE + idx = idx + 1 + .endr +- .fill 4 * L2_PAGETABLE_ENTRIES - 8, 8, 0 + .size l2_identmap, . - l2_identmap + + /* +-- +2.14.3 + + +From 3d1afab1f6a092006b5bbd36a84186203989d846 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 28 Nov 2017 14:53:51 +0000 +Subject: [PATCH 33/77] x86/entry: Probe for Xen early during boot + +Signed-off-by: Andrew Cooper +--- +v2: Add __read_mostly. 
+--- + xen/arch/x86/guest/Makefile | 2 ++ + xen/arch/x86/guest/xen.c | 75 +++++++++++++++++++++++++++++++++++++++++ + xen/arch/x86/setup.c | 2 ++ + xen/include/asm-x86/guest.h | 1 + + xen/include/asm-x86/guest/xen.h | 47 ++++++++++++++++++++++++++ + 5 files changed, 127 insertions(+) + create mode 100644 xen/arch/x86/guest/xen.c + create mode 100644 xen/include/asm-x86/guest/xen.h + +diff --git a/xen/arch/x86/guest/Makefile b/xen/arch/x86/guest/Makefile +index a5f1625ab1..1345a60c81 100644 +--- a/xen/arch/x86/guest/Makefile ++++ b/xen/arch/x86/guest/Makefile +@@ -1 +1,3 @@ ++obj-y += xen.o ++ + obj-bin-$(CONFIG_PVH_GUEST) += pvh-boot.init.o +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +new file mode 100644 +index 0000000000..8507757841 +--- /dev/null ++++ b/xen/arch/x86/guest/xen.c +@@ -0,0 +1,75 @@ ++/****************************************************************************** ++ * arch/x86/guest/xen.c ++ * ++ * Support for detecting and running under Xen. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++bool __read_mostly xen_guest; ++ ++static __read_mostly uint32_t xen_cpuid_base; ++ ++static void __init find_xen_leaves(void) ++{ ++ uint32_t eax, ebx, ecx, edx, base; ++ ++ for ( base = XEN_CPUID_FIRST_LEAF; ++ base < XEN_CPUID_FIRST_LEAF + 0x10000; base += 0x100 ) ++ { ++ cpuid(base, &eax, &ebx, &ecx, &edx); ++ ++ if ( (ebx == XEN_CPUID_SIGNATURE_EBX) && ++ (ecx == XEN_CPUID_SIGNATURE_ECX) && ++ (edx == XEN_CPUID_SIGNATURE_EDX) && ++ ((eax - base) >= 2) ) ++ { ++ xen_cpuid_base = base; ++ break; ++ } ++ } ++} ++ ++void __init probe_hypervisor(void) ++{ ++ /* Too early to use cpu_has_hypervisor */ ++ if ( !(cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_HYPERVISOR)) ) ++ return; ++ ++ find_xen_leaves(); ++ ++ if ( !xen_cpuid_base ) ++ return; ++ ++ xen_guest = true; ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 4b8d09b751..d8059f23b5 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -715,6 +715,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + * allocing any xenheap structures wanted in lower memory. 
*/ + kexec_early_calculations(); + ++ probe_hypervisor(); ++ + parse_video_info(); + + rdmsrl(MSR_EFER, this_cpu(efer)); +diff --git a/xen/include/asm-x86/guest.h b/xen/include/asm-x86/guest.h +index 630c092c25..8d91f81451 100644 +--- a/xen/include/asm-x86/guest.h ++++ b/xen/include/asm-x86/guest.h +@@ -20,6 +20,7 @@ + #define __X86_GUEST_H__ + + #include ++#include + + #endif /* __X86_GUEST_H__ */ + +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +new file mode 100644 +index 0000000000..97a7c8d531 +--- /dev/null ++++ b/xen/include/asm-x86/guest/xen.h +@@ -0,0 +1,47 @@ ++/****************************************************************************** ++ * asm-x86/guest/xen.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms and conditions of the GNU General Public ++ * License, version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++ ++#ifndef __X86_GUEST_XEN_H__ ++#define __X86_GUEST_XEN_H__ ++ ++#include ++ ++#ifdef CONFIG_XEN_GUEST ++ ++extern bool xen_guest; ++ ++void probe_hypervisor(void); ++ ++#else ++ ++#define xen_guest 0 ++ ++static inline void probe_hypervisor(void) {}; ++ ++#endif /* CONFIG_XEN_GUEST */ ++#endif /* __X86_GUEST_XEN_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From b38cc15b2f6170e0a8864aa9f151cc0e4b388c3f Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 21 Nov 2017 13:54:47 +0000 +Subject: [PATCH 34/77] x86/guest: Hypercall support + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- +v2: append underscores to tmp. +--- + xen/arch/x86/guest/Makefile | 1 + + xen/arch/x86/guest/hypercall_page.S | 79 ++++++++++++++++++++++++++++++ + xen/arch/x86/guest/xen.c | 5 ++ + xen/arch/x86/xen.lds.S | 1 + + xen/include/asm-x86/guest.h | 1 + + xen/include/asm-x86/guest/hypercall.h | 92 +++++++++++++++++++++++++++++++++++ + 6 files changed, 179 insertions(+) + create mode 100644 xen/arch/x86/guest/hypercall_page.S + create mode 100644 xen/include/asm-x86/guest/hypercall.h + +diff --git a/xen/arch/x86/guest/Makefile b/xen/arch/x86/guest/Makefile +index 1345a60c81..26fb4b1007 100644 +--- a/xen/arch/x86/guest/Makefile ++++ b/xen/arch/x86/guest/Makefile +@@ -1,3 +1,4 @@ ++obj-y += hypercall_page.o + obj-y += xen.o + + obj-bin-$(CONFIG_PVH_GUEST) += pvh-boot.init.o +diff --git a/xen/arch/x86/guest/hypercall_page.S b/xen/arch/x86/guest/hypercall_page.S +new file mode 100644 +index 0000000000..fdd2e72272 +--- /dev/null ++++ b/xen/arch/x86/guest/hypercall_page.S +@@ -0,0 +1,79 @@ ++#include ++#include ++#include ++ ++ .section ".text.page_aligned", "ax", @progbits ++ .p2align PAGE_SHIFT ++ ++GLOBAL(hypercall_page) ++ /* Poisoned with `ret` for safety before hypercalls are set up. 
*/ ++ .fill PAGE_SIZE, 1, 0xc3 ++ .type hypercall_page, STT_OBJECT ++ .size hypercall_page, PAGE_SIZE ++ ++/* ++ * Identify a specific hypercall in the hypercall page ++ * @param name Hypercall name. ++ */ ++#define DECLARE_HYPERCALL(name) \ ++ .globl HYPERCALL_ ## name; \ ++ .set HYPERCALL_ ## name, hypercall_page + __HYPERVISOR_ ## name * 32; \ ++ .type HYPERCALL_ ## name, STT_FUNC; \ ++ .size HYPERCALL_ ## name, 32 ++ ++DECLARE_HYPERCALL(set_trap_table) ++DECLARE_HYPERCALL(mmu_update) ++DECLARE_HYPERCALL(set_gdt) ++DECLARE_HYPERCALL(stack_switch) ++DECLARE_HYPERCALL(set_callbacks) ++DECLARE_HYPERCALL(fpu_taskswitch) ++DECLARE_HYPERCALL(sched_op_compat) ++DECLARE_HYPERCALL(platform_op) ++DECLARE_HYPERCALL(set_debugreg) ++DECLARE_HYPERCALL(get_debugreg) ++DECLARE_HYPERCALL(update_descriptor) ++DECLARE_HYPERCALL(memory_op) ++DECLARE_HYPERCALL(multicall) ++DECLARE_HYPERCALL(update_va_mapping) ++DECLARE_HYPERCALL(set_timer_op) ++DECLARE_HYPERCALL(event_channel_op_compat) ++DECLARE_HYPERCALL(xen_version) ++DECLARE_HYPERCALL(console_io) ++DECLARE_HYPERCALL(physdev_op_compat) ++DECLARE_HYPERCALL(grant_table_op) ++DECLARE_HYPERCALL(vm_assist) ++DECLARE_HYPERCALL(update_va_mapping_otherdomain) ++DECLARE_HYPERCALL(iret) ++DECLARE_HYPERCALL(vcpu_op) ++DECLARE_HYPERCALL(set_segment_base) ++DECLARE_HYPERCALL(mmuext_op) ++DECLARE_HYPERCALL(xsm_op) ++DECLARE_HYPERCALL(nmi_op) ++DECLARE_HYPERCALL(sched_op) ++DECLARE_HYPERCALL(callback_op) ++DECLARE_HYPERCALL(xenoprof_op) ++DECLARE_HYPERCALL(event_channel_op) ++DECLARE_HYPERCALL(physdev_op) ++DECLARE_HYPERCALL(hvm_op) ++DECLARE_HYPERCALL(sysctl) ++DECLARE_HYPERCALL(domctl) ++DECLARE_HYPERCALL(kexec_op) ++DECLARE_HYPERCALL(tmem_op) ++DECLARE_HYPERCALL(xc_reserved_op) ++DECLARE_HYPERCALL(xenpmu_op) ++ ++DECLARE_HYPERCALL(arch_0) ++DECLARE_HYPERCALL(arch_1) ++DECLARE_HYPERCALL(arch_2) ++DECLARE_HYPERCALL(arch_3) ++DECLARE_HYPERCALL(arch_4) ++DECLARE_HYPERCALL(arch_5) ++DECLARE_HYPERCALL(arch_6) ++DECLARE_HYPERCALL(arch_7) ++ ++/* ++ 
* Local variables: ++ * tab-width: 8 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 8507757841..10b90d0f61 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -22,6 +22,7 @@ + #include + + #include ++#include + #include + + #include +@@ -29,6 +30,7 @@ + bool __read_mostly xen_guest; + + static __read_mostly uint32_t xen_cpuid_base; ++extern char hypercall_page[]; + + static void __init find_xen_leaves(void) + { +@@ -61,6 +63,9 @@ void __init probe_hypervisor(void) + if ( !xen_cpuid_base ) + return; + ++ /* Fill the hypercall page. */ ++ wrmsrl(cpuid_ebx(xen_cpuid_base + 2), __pa(hypercall_page)); ++ + xen_guest = true; + } + +diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S +index 2023f971e4..509f176913 100644 +--- a/xen/arch/x86/xen.lds.S ++++ b/xen/arch/x86/xen.lds.S +@@ -65,6 +65,7 @@ SECTIONS + DECL_SECTION(.text) { + _stext = .; /* Text and read-only data */ + *(.text) ++ *(.text.page_aligned) + *(.text.cold) + *(.text.unlikely) + *(.fixup) +diff --git a/xen/include/asm-x86/guest.h b/xen/include/asm-x86/guest.h +index 8d91f81451..5abdb8c433 100644 +--- a/xen/include/asm-x86/guest.h ++++ b/xen/include/asm-x86/guest.h +@@ -19,6 +19,7 @@ + #ifndef __X86_GUEST_H__ + #define __X86_GUEST_H__ + ++#include + #include + #include + +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +new file mode 100644 +index 0000000000..d959c3dd8a +--- /dev/null ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -0,0 +1,92 @@ ++/****************************************************************************** ++ * asm-x86/guest/hypercall.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms and conditions of the GNU General Public ++ * License, version 2, as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. ++ */ ++ ++#ifndef __X86_XEN_HYPERCALL_H__ ++#define __X86_XEN_HYPERCALL_H__ ++ ++#ifdef CONFIG_XEN_GUEST ++ ++/* ++ * Hypercall primitives for 64bit ++ * ++ * Inputs: %rdi, %rsi, %rdx, %r10, %r8, %r9 (arguments 1-6) ++ */ ++ ++#define _hypercall64_1(type, hcall, a1) \ ++ ({ \ ++ long res, tmp__; \ ++ asm volatile ( \ ++ "call hypercall_page + %c[offset]" \ ++ : "=a" (res), "=D" (tmp__) \ ++ : [offset] "i" (hcall * 32), \ ++ "1" ((long)(a1)) \ ++ : "memory" ); \ ++ (type)res; \ ++ }) ++ ++#define _hypercall64_2(type, hcall, a1, a2) \ ++ ({ \ ++ long res, tmp__; \ ++ asm volatile ( \ ++ "call hypercall_page + %c[offset]" \ ++ : "=a" (res), "=D" (tmp__), "=S" (tmp__) \ ++ : [offset] "i" (hcall * 32), \ ++ "1" ((long)(a1)), "2" ((long)(a2)) \ ++ : "memory" ); \ ++ (type)res; \ ++ }) ++ ++#define _hypercall64_3(type, hcall, a1, a2, a3) \ ++ ({ \ ++ long res, tmp__; \ ++ asm volatile ( \ ++ "call hypercall_page + %c[offset]" \ ++ : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__) \ ++ : [offset] "i" (hcall * 32), \ ++ "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)) \ ++ : "memory" ); \ ++ (type)res; \ ++ }) ++ ++#define _hypercall64_4(type, hcall, a1, a2, a3, a4) \ ++ ({ \ ++ long res, tmp__; \ ++ register long _a4 asm ("r10") = ((long)(a4)); \ ++ asm volatile ( \ ++ "call hypercall_page + %c[offset]" \ ++ : "=a" (res), "=D" (tmp__), "=S" (tmp__), "=d" (tmp__), \ ++ "=&r" (tmp__) \ ++ : [offset] "i" (hcall * 32), \ ++ "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)), \ ++ "4" (_a4) \ ++ : "memory" ); \ ++ (type)res; \ ++ }) ++ ++#endif /*
CONFIG_XEN_GUEST */ ++#endif /* __X86_XEN_HYPERCALL_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From 9752c7422b9193e18523d9c443bc0dad7ae0c7c7 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 21 Nov 2017 14:43:32 +0000 +Subject: [PATCH 35/77] x86/shutdown: Support for using + SCHEDOP_{shutdown,reboot} + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +Reviewed-by: Jan Beulich +--- +v2: +1. Use sched_shutdown +2. Move header inclusion +--- + docs/misc/xen-command-line.markdown | 3 +++ + xen/arch/x86/shutdown.c | 34 ++++++++++++++++++++++++++++++---- + xen/include/asm-x86/guest/hypercall.h | 32 ++++++++++++++++++++++++++++++++ + 3 files changed, 65 insertions(+), 4 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 781110d4b2..e5979bceee 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1478,6 +1478,9 @@ Specify the host reboot method. + 'efi' instructs Xen to reboot using the EFI reboot call (in EFI mode by + default it will use that method first). 
+ ++`xen` instructs Xen to reboot using Xen's SCHEDOP hypercall (this is the default ++when running nested Xen) ++ + ### rmrr + > '= start<-end>=[s1]bdf1[,[s1]bdf2[,...]];start<-end>=[s2]bdf1[,[s2]bdf2[,...]] + +diff --git a/xen/arch/x86/shutdown.c b/xen/arch/x86/shutdown.c +index a87aa60add..689f6f137d 100644 +--- a/xen/arch/x86/shutdown.c ++++ b/xen/arch/x86/shutdown.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + enum reboot_type { + BOOT_INVALID, +@@ -34,6 +35,7 @@ enum reboot_type { + BOOT_CF9 = 'p', + BOOT_CF9_PWR = 'P', + BOOT_EFI = 'e', ++ BOOT_XEN = 'x', + }; + + static int reboot_mode; +@@ -49,6 +51,7 @@ static int reboot_mode; + * pci Use the so-called "PCI reset register", CF9 + * Power Like 'pci' but for a full power-cyle reset + * efi Use the EFI reboot (if running under EFI) ++ * xen Use Xen SCHEDOP hypercall (if running under Xen as a guest) + */ + static enum reboot_type reboot_type = BOOT_INVALID; + +@@ -75,6 +78,7 @@ static int __init set_reboot_type(const char *str) + case 'P': + case 'p': + case 't': ++ case 'x': + reboot_type = *str; + break; + default: +@@ -93,6 +97,13 @@ static int __init set_reboot_type(const char *str) + reboot_type = BOOT_INVALID; + } + ++ if ( reboot_type == BOOT_XEN && !xen_guest ) ++ { ++ printk("Xen reboot selected, but Xen hypervisor not detected\n" ++ "Falling back to default\n"); ++ reboot_type = BOOT_INVALID; ++ } ++ + return rc; + } + custom_param("reboot", set_reboot_type); +@@ -109,6 +120,10 @@ static inline void kb_wait(void) + static void noreturn __machine_halt(void *unused) + { + local_irq_disable(); ++ ++ if ( reboot_type == BOOT_XEN ) ++ xen_hypercall_shutdown(SHUTDOWN_poweroff); ++ + for ( ; ; ) + halt(); + } +@@ -129,10 +144,17 @@ void machine_halt(void) + + static void default_reboot_type(void) + { +- if ( reboot_type == BOOT_INVALID ) +- reboot_type = efi_enabled(EFI_RS) ? BOOT_EFI +- : acpi_disabled ? 
BOOT_KBD +- : BOOT_ACPI; ++ if ( reboot_type != BOOT_INVALID ) ++ return; ++ ++ if ( xen_guest ) ++ reboot_type = BOOT_XEN; ++ else if ( efi_enabled(EFI_RS) ) ++ reboot_type = BOOT_EFI; ++ else if ( acpi_disabled ) ++ reboot_type = BOOT_KBD; ++ else ++ reboot_type = BOOT_ACPI; + } + + static int __init override_reboot(struct dmi_system_id *d) +@@ -618,6 +640,10 @@ void machine_restart(unsigned int delay_millisecs) + } + reboot_type = BOOT_ACPI; + break; ++ ++ case BOOT_XEN: ++ xen_hypercall_shutdown(SHUTDOWN_reboot); ++ break; + } + } + } +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index d959c3dd8a..a05041d30b 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -21,6 +21,11 @@ + + #ifdef CONFIG_XEN_GUEST + ++#include ++ ++#include ++#include ++ + /* + * Hypercall primatives for 64bit + * +@@ -78,6 +83,33 @@ + (type)res; \ + }) + ++/* ++ * Primitive Hypercall wrappers ++ */ ++static inline long xen_hypercall_sched_op(unsigned int cmd, void *arg) ++{ ++ return _hypercall64_2(long, __HYPERVISOR_sched_op, cmd, arg); ++} ++ ++/* ++ * Higher level hypercall helpers ++ */ ++static inline long xen_hypercall_shutdown(unsigned int reason) ++{ ++ struct sched_shutdown s = { .reason = reason }; ++ return xen_hypercall_sched_op(SCHEDOP_shutdown, &s); ++} ++ ++#else /* CONFIG_XEN_GUEST */ ++ ++#include ++ ++static inline long xen_hypercall_shutdown(unsigned int reason) ++{ ++ ASSERT_UNREACHABLE(); ++ return 0; ++} ++ + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_XEN_HYPERCALL_H__ */ + +-- +2.14.3 + + +From 2f5a0121434559b2f8e5b17dc0119699684e3b17 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Tue, 14 Nov 2017 18:19:09 +0000 +Subject: [PATCH 36/77] x86/pvh: Retrieve memory map from Xen +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Wei Liu +Signed-off-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +--- +v2: fixed 
clang build, dropped rb tag +--- + xen/arch/x86/e820.c | 3 +-- + xen/arch/x86/guest/pvh-boot.c | 20 ++++++++++++++++++++ + xen/arch/x86/guest/xen.c | 3 +++ + xen/arch/x86/setup.c | 7 ++++++- + xen/include/asm-x86/e820.h | 1 + + xen/include/asm-x86/guest/hypercall.h | 5 +++++ + 6 files changed, 36 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/e820.c b/xen/arch/x86/e820.c +index 7c572bade2..b422a684ee 100644 +--- a/xen/arch/x86/e820.c ++++ b/xen/arch/x86/e820.c +@@ -134,8 +134,7 @@ static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + +-static int __init sanitize_e820_map(struct e820entry *biosmap, +- unsigned int *pnr_map) ++int __init sanitize_e820_map(struct e820entry *biosmap, unsigned int *pnr_map) + { + struct change_member *change_tmp; + unsigned long current_type, last_type; +diff --git a/xen/arch/x86/guest/pvh-boot.c b/xen/arch/x86/guest/pvh-boot.c +index 186e332657..be3122b16c 100644 +--- a/xen/arch/x86/guest/pvh-boot.c ++++ b/xen/arch/x86/guest/pvh-boot.c +@@ -22,6 +22,7 @@ + #include + #include + ++#include + #include + + #include +@@ -70,10 +71,29 @@ static void __init convert_pvh_info(void) + } + } + ++static void __init get_memory_map(void) ++{ ++ struct xen_memory_map memmap = { ++ .nr_entries = E820MAX, ++ }; ++ ++ set_xen_guest_handle(memmap.buffer, e820_raw.map); ++ BUG_ON(xen_hypercall_memory_op(XENMEM_memory_map, &memmap)); ++ e820_raw.nr_map = memmap.nr_entries; ++ ++ /* :( Various toolstacks don't sort the memory map. 
*/ ++ sanitize_e820_map(e820_raw.map, &e820_raw.nr_map); ++} ++ + multiboot_info_t *__init pvh_init(void) + { + convert_pvh_info(); + ++ probe_hypervisor(); ++ ASSERT(xen_guest); ++ ++ get_memory_map(); ++ + return &pvh_mbi; + } + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 10b90d0f61..c253ebd983 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -54,6 +54,9 @@ static void __init find_xen_leaves(void) + + void __init probe_hypervisor(void) + { ++ if ( xen_guest ) ++ return; ++ + /* Too early to use cpu_has_hypervisor */ + if ( !(cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_HYPERVISOR)) ) + return; +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index d8059f23b5..edb43bf2cb 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -795,7 +795,12 @@ void __init noreturn __start_xen(unsigned long mbi_p) + if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) ) + panic("dom0 kernel not specified. Check bootloader configuration."); + +- if ( efi_enabled(EFI_LOADER) ) ++ if ( pvh_boot ) ++ { ++ /* pvh_init() already filled in e820_raw */ ++ memmap_type = "PVH-e820"; ++ } ++ else if ( efi_enabled(EFI_LOADER) ) + { + set_pdx_range(xen_phys_start >> PAGE_SHIFT, + (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT); +diff --git a/xen/include/asm-x86/e820.h b/xen/include/asm-x86/e820.h +index 28defa8545..ee317b17aa 100644 +--- a/xen/include/asm-x86/e820.h ++++ b/xen/include/asm-x86/e820.h +@@ -23,6 +23,7 @@ struct e820map { + struct e820entry map[E820MAX]; + }; + ++extern int sanitize_e820_map(struct e820entry *biosmap, unsigned int *pnr_map); + extern int e820_all_mapped(u64 start, u64 end, unsigned type); + extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e); + extern int e820_change_range_type( +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index a05041d30b..e0b00f97fb 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ 
b/xen/include/asm-x86/guest/hypercall.h +@@ -91,6 +91,11 @@ static inline long xen_hypercall_sched_op(unsigned int cmd, void *arg) + return _hypercall64_2(long, __HYPERVISOR_sched_op, cmd, arg); + } + ++static inline long xen_hypercall_memory_op(unsigned int cmd, void *arg) ++{ ++ return _hypercall64_2(long, __HYPERVISOR_memory_op, cmd, arg); ++} ++ + /* + * Higher level hypercall helpers + */ +-- +2.14.3 + + +From 10128f33aa344f1f57584fd9ea528e1518b0d5fd Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Thu, 11 Jan 2018 10:18:09 +0000 +Subject: [PATCH 37/77] xen/console: Introduce console=xen + +This specifies whether to use Xen specific console output. There are +two variants: one is the hypervisor console, the other is the magic +debug port 0xe9. + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- + xen/drivers/char/console.c | 46 +++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/guest/hypercall.h | 13 ++++++++++ + 2 files changed, 59 insertions(+) + +diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c +index 19d0e74f17..d05ebf9f70 100644 +--- a/xen/drivers/char/console.c ++++ b/xen/drivers/char/console.c +@@ -31,6 +31,10 @@ + #include + #include + ++#ifdef CONFIG_X86 ++#include ++#endif ++ + /* console: comma-separated list of console outputs. 
*/ + static char __initdata opt_console[30] = OPT_CONSOLE_STR; + string_param("console", opt_console); +@@ -83,6 +87,10 @@ static uint32_t conringc, conringp; + + static int __read_mostly sercon_handle = -1; + ++#ifdef CONFIG_X86 ++static bool __read_mostly opt_console_xen; /* console=xen */ ++#endif ++ + static DEFINE_SPINLOCK(console_lock); + + /* +@@ -432,6 +440,16 @@ static void notify_dom0_con_ring(unsigned long unused) + static DECLARE_SOFTIRQ_TASKLET(notify_dom0_con_ring_tasklet, + notify_dom0_con_ring, 0); + ++#ifdef CONFIG_X86 ++static inline void xen_console_write_debug_port(const char *buf, size_t len) ++{ ++ unsigned long tmp; ++ asm volatile ( "rep outsb;" ++ : "=&S" (tmp), "=&c" (tmp) ++ : "0" (buf), "1" (len), "d" (0xe9) ); ++} ++#endif ++ + static long guest_console_write(XEN_GUEST_HANDLE_PARAM(char) buffer, int count) + { + char kbuf[128]; +@@ -458,6 +476,18 @@ static long guest_console_write(XEN_GUEST_HANDLE_PARAM(char) buffer, int count) + sercon_puts(kbuf); + video_puts(kbuf); + ++#ifdef CONFIG_X86 ++ if ( opt_console_xen ) ++ { ++ size_t len = strlen(kbuf); ++ ++ if ( xen_guest ) ++ xen_hypercall_console_write(kbuf, len); ++ else ++ xen_console_write_debug_port(kbuf, len); ++ } ++#endif ++ + if ( opt_console_to_ring ) + { + conring_puts(kbuf); +@@ -567,6 +597,18 @@ static void __putstr(const char *str) + sercon_puts(str); + video_puts(str); + ++#ifdef CONFIG_X86 ++ if ( opt_console_xen ) ++ { ++ size_t len = strlen(str); ++ ++ if ( xen_guest ) ++ xen_hypercall_console_write(str, len); ++ else ++ xen_console_write_debug_port(str, len); ++ } ++#endif ++ + conring_puts(str); + + if ( !console_locks_busted ) +@@ -762,6 +804,10 @@ void __init console_init_preirq(void) + p++; + if ( !strncmp(p, "vga", 3) ) + video_init(); ++#ifdef CONFIG_X86 ++ else if ( !strncmp(p, "xen", 3) ) ++ opt_console_xen = true; ++#endif + else if ( !strncmp(p, "none", 4) ) + continue; + else if ( (sh = serial_parse_handle(p)) >= 0 ) +diff --git 
a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index e0b00f97fb..9cd95d2b92 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -99,6 +99,13 @@ static inline long xen_hypercall_memory_op(unsigned int cmd, void *arg) + /* + * Higher level hypercall helpers + */ ++static inline void xen_hypercall_console_write( ++ const char *buf, unsigned int count) ++{ ++ (void)_hypercall64_3(long, __HYPERVISOR_console_io, ++ CONSOLEIO_write, count, buf); ++} ++ + static inline long xen_hypercall_shutdown(unsigned int reason) + { + struct sched_shutdown s = { .reason = reason }; +@@ -109,6 +116,12 @@ static inline long xen_hypercall_shutdown(unsigned int reason) + + #include + ++static inline void xen_hypercall_console_write( ++ const char *buf, unsigned int count) ++{ ++ ASSERT_UNREACHABLE(); ++} ++ + static inline long xen_hypercall_shutdown(unsigned int reason) + { + ASSERT_UNREACHABLE(); +-- +2.14.3 + + +From 1fa54448348d6cc36b89bb9e1729ea601013b00f Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Wed, 3 Jan 2018 16:38:54 +0000 +Subject: [PATCH 38/77] xen: introduce rangeset_claim_range + +Reserve a hole in a rangeset. + +Signed-off-by: Roger Pau Monne +Signed-off-by: Wei Liu +--- +Changes since v1: + - Change function name. + - Use a local variable instead of *s. + - Add unlikely to the !prev case. + - Move the function prototype position in the header file. 
+--- + xen/common/rangeset.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ + xen/include/xen/rangeset.h | 4 +++- + 2 files changed, 55 insertions(+), 1 deletion(-) + +diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c +index 6c6293c15c..ade34f6a50 100644 +--- a/xen/common/rangeset.c ++++ b/xen/common/rangeset.c +@@ -298,6 +298,58 @@ int rangeset_report_ranges( + return rc; + } + ++int rangeset_claim_range(struct rangeset *r, unsigned long size, ++ unsigned long *s) ++{ ++ struct range *prev, *next; ++ unsigned long start = 0; ++ ++ write_lock(&r->lock); ++ ++ for ( prev = NULL, next = first_range(r); ++ next; ++ prev = next, next = next_range(r, next) ) ++ { ++ if ( (next->s - start) >= size ) ++ goto insert; ++ ++ if ( next->e == ~0UL ) ++ goto out; ++ ++ start = next->e + 1; ++ } ++ ++ if ( (~0UL - start) + 1 >= size ) ++ goto insert; ++ ++ out: ++ write_unlock(&r->lock); ++ return -ENOSPC; ++ ++ insert: ++ if ( unlikely(!prev) ) ++ { ++ next = alloc_range(r); ++ if ( !next ) ++ { ++ write_unlock(&r->lock); ++ return -ENOMEM; ++ } ++ ++ next->s = start; ++ next->e = start + size - 1; ++ insert_range(r, prev, next); ++ } ++ else ++ prev->e += size; ++ ++ write_unlock(&r->lock); ++ ++ *s = start; ++ ++ return 0; ++} ++ + int rangeset_add_singleton( + struct rangeset *r, unsigned long s) + { +diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h +index aa6408248b..1f83b1f44b 100644 +--- a/xen/include/xen/rangeset.h ++++ b/xen/include/xen/rangeset.h +@@ -55,9 +55,11 @@ void rangeset_limit( + bool_t __must_check rangeset_is_empty( + const struct rangeset *r); + +-/* Add/remove/query a numeric range. */ ++/* Add/claim/remove/query a numeric range. 
*/ + int __must_check rangeset_add_range( + struct rangeset *r, unsigned long s, unsigned long e); ++int __must_check rangeset_claim_range(struct rangeset *r, unsigned long size, ++ unsigned long *s); + int __must_check rangeset_remove_range( + struct rangeset *r, unsigned long s, unsigned long e); + bool_t __must_check rangeset_contains_range( +-- +2.14.3 + + +From 83186a8e6988b8f218fce57db3a62e35d39b529a Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Wed, 3 Jan 2018 16:50:24 +0000 +Subject: [PATCH 39/77] xen/pvshim: keep track of used PFN ranges + +Simple infrastructure to keep track of PFN space usage, so that we can +use unpopulated PFNs to map special pages like shared info and grant +table. + +As rangeset depends on malloc being ready so hypervisor_setup is +introduced for things that can be initialised late in the process. + +Note that the PFN is marked as reserved at least up to 4GiB (or more +if the guest has more memory). This is not a perfect solution but +avoids using the MMIO hole below 4GiB. Ideally the shim (L1) should +have a way to ask the underlying Xen (L0) which memory regions are +populated, unpopulated, or MMIO space. + +Signed-off-by: Roger Pau Monne +Signed-off-by: Wei Liu +--- + xen/arch/x86/guest/xen.c | 56 +++++++++++++++++++++++++++++++++++++++++ + xen/arch/x86/setup.c | 3 +++ + xen/include/asm-x86/guest/xen.h | 7 ++++++ + 3 files changed, 66 insertions(+) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index c253ebd983..abf53ebbc6 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -19,8 +19,12 @@ + * Copyright (c) 2017 Citrix Systems Ltd. 
+ */ + #include ++#include ++#include ++#include + #include + ++#include + #include + #include + #include +@@ -31,6 +35,7 @@ bool __read_mostly xen_guest; + + static __read_mostly uint32_t xen_cpuid_base; + extern char hypercall_page[]; ++static struct rangeset *mem; + + static void __init find_xen_leaves(void) + { +@@ -72,6 +77,57 @@ void __init probe_hypervisor(void) + xen_guest = true; + } + ++static void __init init_memmap(void) ++{ ++ unsigned int i; ++ ++ mem = rangeset_new(NULL, "host memory map", 0); ++ if ( !mem ) ++ panic("failed to allocate PFN usage rangeset"); ++ ++ /* ++ * Mark up to the last memory page (or 4GiB) as RAM. This is done because ++ * Xen doesn't know the position of possible MMIO holes, so at least try to ++ * avoid the known MMIO hole below 4GiB. Note that this is subject to future ++ * discussion and improvements. ++ */ ++ if ( rangeset_add_range(mem, 0, max_t(unsigned long, max_page - 1, ++ PFN_DOWN(GB(4) - 1))) ) ++ panic("unable to add RAM to in-use PFN rangeset"); ++ ++ for ( i = 0; i < e820.nr_map; i++ ) ++ { ++ struct e820entry *e = &e820.map[i]; ++ ++ if ( rangeset_add_range(mem, PFN_DOWN(e->addr), ++ PFN_UP(e->addr + e->size - 1)) ) ++ panic("unable to add range [%#lx, %#lx] to in-use PFN rangeset", ++ PFN_DOWN(e->addr), PFN_UP(e->addr + e->size - 1)); ++ } ++} ++ ++void __init hypervisor_setup(void) ++{ ++ init_memmap(); ++} ++ ++int hypervisor_alloc_unused_page(mfn_t *mfn) ++{ ++ unsigned long m; ++ int rc; ++ ++ rc = rangeset_claim_range(mem, 1, &m); ++ if ( !rc ) ++ *mfn = _mfn(m); ++ ++ return rc; ++} ++ ++int hypervisor_free_unused_page(mfn_t mfn) ++{ ++ return rangeset_remove_range(mem, mfn_x(mfn), mfn_x(mfn)); ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index edb43bf2cb..b9b97d68f5 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -1472,6 +1472,9 @@ void __init noreturn __start_xen(unsigned long mbi_p) + max_cpus = nr_cpu_ids; + } + ++ if ( 
xen_guest ) ++ hypervisor_setup(); ++ + /* Low mappings were only needed for some BIOS table parsing. */ + zap_low_mappings(); + +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index 97a7c8d531..427837797b 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -26,12 +26,19 @@ + extern bool xen_guest; + + void probe_hypervisor(void); ++void hypervisor_setup(void); ++int hypervisor_alloc_unused_page(mfn_t *mfn); ++int hypervisor_free_unused_page(mfn_t mfn); + + #else + + #define xen_guest 0 + + static inline void probe_hypervisor(void) {}; ++static inline void hypervisor_setup(void) ++{ ++ ASSERT_UNREACHABLE(); ++} + + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_GUEST_XEN_H__ */ +-- +2.14.3 + + +From efa15c993b600e9636cd091c626ee0c989afc62f Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 9 Jan 2018 11:19:44 +0000 +Subject: [PATCH 40/77] x86/guest: map shared_info page + +Use an unpopulated PFN in order to map it. + +Signed-off-by: Roger Pau Monne +Signed-off-by: Wei Liu +Signed-off-by: Andrew Cooper +--- +Changes since v1: + - Use an unpopulated PFN to map the shared_info page. + - Mask all event channels. + - Report XENMEM_add_to_physmap error code in case of failure. 
+--- + xen/arch/x86/guest/xen.c | 27 +++++++++++++++++++++++++++ + xen/include/asm-x86/fixmap.h | 3 +++ + xen/include/asm-x86/guest/xen.h | 5 +++++ + 3 files changed, 35 insertions(+) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index abf53ebbc6..f62f93af16 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -77,6 +77,31 @@ void __init probe_hypervisor(void) + xen_guest = true; + } + ++static void map_shared_info(void) ++{ ++ mfn_t mfn; ++ struct xen_add_to_physmap xatp = { ++ .domid = DOMID_SELF, ++ .space = XENMAPSPACE_shared_info, ++ }; ++ unsigned int i; ++ unsigned long rc; ++ ++ if ( hypervisor_alloc_unused_page(&mfn) ) ++ panic("unable to reserve shared info memory page"); ++ ++ xatp.gpfn = mfn_x(mfn); ++ rc = xen_hypercall_memory_op(XENMEM_add_to_physmap, &xatp); ++ if ( rc ) ++ panic("failed to map shared_info page: %ld", rc); ++ ++ set_fixmap(FIX_XEN_SHARED_INFO, mfn_x(mfn) << PAGE_SHIFT); ++ ++ /* Mask all upcalls */ ++ for ( i = 0; i < ARRAY_SIZE(XEN_shared_info->evtchn_mask); i++ ) ++ write_atomic(&XEN_shared_info->evtchn_mask[i], ~0ul); ++} ++ + static void __init init_memmap(void) + { + unsigned int i; +@@ -109,6 +134,8 @@ static void __init init_memmap(void) + void __init hypervisor_setup(void) + { + init_memmap(); ++ ++ map_shared_info(); + } + + int hypervisor_alloc_unused_page(mfn_t *mfn) +diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h +index 51b0e7e945..ded4ddf21b 100644 +--- a/xen/include/asm-x86/fixmap.h ++++ b/xen/include/asm-x86/fixmap.h +@@ -45,6 +45,9 @@ enum fixed_addresses { + FIX_COM_BEGIN, + FIX_COM_END, + FIX_EHCI_DBGP, ++#ifdef CONFIG_XEN_GUEST ++ FIX_XEN_SHARED_INFO, ++#endif /* CONFIG_XEN_GUEST */ + /* Everything else should go further down. 
*/ + FIX_APIC_BASE, + FIX_IO_APIC_BASE_0, +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index 427837797b..f25ad4241b 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -21,6 +21,11 @@ + + #include + ++#include ++#include ++ ++#define XEN_shared_info ((struct shared_info *)fix_to_virt(FIX_XEN_SHARED_INFO)) ++ + #ifdef CONFIG_XEN_GUEST + + extern bool xen_guest; +-- +2.14.3 + + +From d2df09c92bf988af804b65a1db92d8ea82a60350 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Wed, 27 Dec 2017 09:23:01 +0000 +Subject: [PATCH 41/77] xen/guest: fetch vCPU ID from Xen +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If available. + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +[ wei: fix non-shim build ] +Signed-off-by: Wei Liu +--- + xen/arch/x86/guest/xen.c | 23 +++++++++++++++++++++++ + xen/arch/x86/smpboot.c | 4 ++++ + xen/include/asm-x86/guest/xen.h | 7 +++++++ + 3 files changed, 34 insertions(+) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index f62f93af16..de8cfc6e36 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -37,6 +37,8 @@ static __read_mostly uint32_t xen_cpuid_base; + extern char hypercall_page[]; + static struct rangeset *mem; + ++DEFINE_PER_CPU(unsigned int, vcpu_id); ++ + static void __init find_xen_leaves(void) + { + uint32_t eax, ebx, ecx, edx, base; +@@ -102,6 +104,20 @@ static void map_shared_info(void) + write_atomic(&XEN_shared_info->evtchn_mask[i], ~0ul); + } + ++static void set_vcpu_id(void) ++{ ++ uint32_t eax, ebx, ecx, edx; ++ ++ ASSERT(xen_cpuid_base); ++ ++ /* Fetch vcpu id from cpuid. 
*/ ++ cpuid(xen_cpuid_base + 4, &eax, &ebx, &ecx, &edx); ++ if ( eax & XEN_HVM_CPUID_VCPU_ID_PRESENT ) ++ this_cpu(vcpu_id) = ebx; ++ else ++ this_cpu(vcpu_id) = smp_processor_id(); ++} ++ + static void __init init_memmap(void) + { + unsigned int i; +@@ -136,6 +152,13 @@ void __init hypervisor_setup(void) + init_memmap(); + + map_shared_info(); ++ ++ set_vcpu_id(); ++} ++ ++void hypervisor_ap_setup(void) ++{ ++ set_vcpu_id(); + } + + int hypervisor_alloc_unused_page(mfn_t *mfn) +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 1609b627ae..5c7863035e 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -373,6 +374,9 @@ void start_secondary(void *unused) + cpumask_set_cpu(cpu, &cpu_online_map); + unlock_vector_lock(); + ++ if ( xen_guest ) ++ hypervisor_ap_setup(); ++ + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + mtrr_ap_init(); +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index f25ad4241b..db35a9e628 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -32,9 +32,12 @@ extern bool xen_guest; + + void probe_hypervisor(void); + void hypervisor_setup(void); ++void hypervisor_ap_setup(void); + int hypervisor_alloc_unused_page(mfn_t *mfn); + int hypervisor_free_unused_page(mfn_t mfn); + ++DECLARE_PER_CPU(unsigned int, vcpu_id); ++ + #else + + #define xen_guest 0 +@@ -44,6 +47,10 @@ static inline void hypervisor_setup(void) + { + ASSERT_UNREACHABLE(); + } ++static inline void hypervisor_ap_setup(void) ++{ ++ ASSERT_UNREACHABLE(); ++} + + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_GUEST_XEN_H__ */ +-- +2.14.3 + + +From 68e7a08436ed50f9ba51f9c9e88819ba0fedcc24 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 28 Dec 2017 15:22:34 +0000 +Subject: [PATCH 42/77] x86/guest: map per-cpu vcpu_info area. 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Mapping the per-vcpu vcpu_info area is required in order to use more +than XEN_LEGACY_MAX_VCPUS. + +Signed-off-by: Roger Pau Monné +Signed-off-by: Wei Liu +--- +Changes since v1: + - Make vcpu_info_mapped static. + - Add a BUG_ON in case VCPUOP_register_vcpu_info fails. + - Remove one indentation level in hypervisor_setup. + - Make xen_hypercall_vcpu_op return int. +--- + xen/arch/x86/guest/xen.c | 57 +++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/guest/hypercall.h | 8 +++++ + xen/include/asm-x86/guest/xen.h | 1 + + 3 files changed, 66 insertions(+) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index de8cfc6e36..60626ec21c 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -39,6 +39,10 @@ static struct rangeset *mem; + + DEFINE_PER_CPU(unsigned int, vcpu_id); + ++static struct vcpu_info *vcpu_info; ++static unsigned long vcpu_info_mapped[BITS_TO_LONGS(NR_CPUS)]; ++DEFINE_PER_CPU(struct vcpu_info *, vcpu_info); ++ + static void __init find_xen_leaves(void) + { + uint32_t eax, ebx, ecx, edx, base; +@@ -104,6 +108,41 @@ static void map_shared_info(void) + write_atomic(&XEN_shared_info->evtchn_mask[i], ~0ul); + } + ++static int map_vcpuinfo(void) ++{ ++ unsigned int vcpu = this_cpu(vcpu_id); ++ struct vcpu_register_vcpu_info info; ++ int rc; ++ ++ if ( !vcpu_info ) ++ { ++ this_cpu(vcpu_info) = &XEN_shared_info->vcpu_info[vcpu]; ++ return 0; ++ } ++ ++ if ( test_bit(vcpu, vcpu_info_mapped) ) ++ { ++ this_cpu(vcpu_info) = &vcpu_info[vcpu]; ++ return 0; ++ } ++ ++ info.mfn = virt_to_mfn(&vcpu_info[vcpu]); ++ info.offset = (unsigned long)&vcpu_info[vcpu] & ~PAGE_MASK; ++ rc = xen_hypercall_vcpu_op(VCPUOP_register_vcpu_info, vcpu, &info); ++ if ( rc ) ++ { ++ BUG_ON(vcpu >= XEN_LEGACY_MAX_VCPUS); ++ this_cpu(vcpu_info) = &XEN_shared_info->vcpu_info[vcpu]; ++ } ++ else ++ { ++ this_cpu(vcpu_info) = &vcpu_info[vcpu]; ++ 
set_bit(vcpu, vcpu_info_mapped); ++ } ++ ++ return rc; ++} ++ + static void set_vcpu_id(void) + { + uint32_t eax, ebx, ecx, edx; +@@ -154,11 +193,29 @@ void __init hypervisor_setup(void) + map_shared_info(); + + set_vcpu_id(); ++ vcpu_info = xzalloc_array(struct vcpu_info, nr_cpu_ids); ++ if ( map_vcpuinfo() ) ++ { ++ xfree(vcpu_info); ++ vcpu_info = NULL; ++ } ++ if ( !vcpu_info && nr_cpu_ids > XEN_LEGACY_MAX_VCPUS ) ++ { ++ unsigned int i; ++ ++ for ( i = XEN_LEGACY_MAX_VCPUS; i < nr_cpu_ids; i++ ) ++ __cpumask_clear_cpu(i, &cpu_present_map); ++ nr_cpu_ids = XEN_LEGACY_MAX_VCPUS; ++ printk(XENLOG_WARNING ++ "unable to map vCPU info, limiting vCPUs to: %u\n", ++ XEN_LEGACY_MAX_VCPUS); ++ } + } + + void hypervisor_ap_setup(void) + { + set_vcpu_id(); ++ map_vcpuinfo(); + } + + int hypervisor_alloc_unused_page(mfn_t *mfn) +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index 9cd95d2b92..dbc57a566e 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -26,6 +26,8 @@ + #include + #include + ++#include ++ + /* + * Hypercall primatives for 64bit + * +@@ -96,6 +98,12 @@ static inline long xen_hypercall_memory_op(unsigned int cmd, void *arg) + return _hypercall64_2(long, __HYPERVISOR_memory_op, cmd, arg); + } + ++static inline int xen_hypercall_vcpu_op(unsigned int cmd, unsigned int vcpu, ++ void *arg) ++{ ++ return _hypercall64_3(long, __HYPERVISOR_vcpu_op, cmd, vcpu, arg); ++} ++ + /* + * Higher level hypercall helpers + */ +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index db35a9e628..b3e684f756 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -37,6 +37,7 @@ int hypervisor_alloc_unused_page(mfn_t *mfn); + int hypervisor_free_unused_page(mfn_t mfn); + + DECLARE_PER_CPU(unsigned int, vcpu_id); ++DECLARE_PER_CPU(struct vcpu_info *, vcpu_info); + + #else + +-- +2.14.3 + + +From 
f5ca36927e87fd4fee647ca567aca01b7ab78004 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Thu, 16 Nov 2017 17:56:18 +0000 +Subject: [PATCH 43/77] x86: xen pv clock time source +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It is a variant of TSC clock source. + +Signed-off-by: Wei Liu +Signed-off-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Use the mapped vcpu_info. +--- + xen/arch/x86/time.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 89 insertions(+) + +diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c +index 6c20b1036d..ab866ad68d 100644 +--- a/xen/arch/x86/time.c ++++ b/xen/arch/x86/time.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -525,6 +526,91 @@ static struct platform_timesource __initdata plt_tsc = + .init = init_tsc, + }; + ++#ifdef CONFIG_XEN_GUEST ++/************************************************************ ++ * PLATFORM TIMER 5: XEN PV CLOCK SOURCE ++ * ++ * Xen clock source is a variant of TSC source. 
++ */ ++ ++static uint64_t xen_timer_cpu_frequency(void) ++{ ++ struct vcpu_time_info *info = &this_cpu(vcpu_info)->time; ++ uint64_t freq; ++ ++ freq = 1000000000ULL << 32; ++ do_div(freq, info->tsc_to_system_mul); ++ if ( info->tsc_shift < 0 ) ++ freq <<= -info->tsc_shift; ++ else ++ freq >>= info->tsc_shift; ++ ++ return freq; ++} ++ ++static int64_t __init init_xen_timer(struct platform_timesource *pts) ++{ ++ if ( !xen_guest ) ++ return 0; ++ ++ pts->frequency = xen_timer_cpu_frequency(); ++ ++ return pts->frequency; ++} ++ ++static always_inline uint64_t read_cycle(const struct vcpu_time_info *info, ++ uint64_t tsc) ++{ ++ uint64_t delta = tsc - info->tsc_timestamp; ++ struct time_scale ts = { ++ .shift = info->tsc_shift, ++ .mul_frac = info->tsc_to_system_mul, ++ }; ++ uint64_t offset = scale_delta(delta, &ts); ++ ++ return info->system_time + offset; ++} ++ ++static uint64_t read_xen_timer(void) ++{ ++ struct vcpu_time_info *info = &this_cpu(vcpu_info)->time; ++ uint32_t version; ++ uint64_t ret; ++ uint64_t last; ++ static uint64_t last_value; ++ ++ do { ++ version = info->version & ~1; ++ /* Make sure version is read before the data */ ++ smp_rmb(); ++ ++ ret = read_cycle(info, rdtsc_ordered()); ++ /* Ignore fancy flags for now */ ++ ++ /* Make sure version is reread after the data */ ++ smp_rmb(); ++ } while ( unlikely(version != info->version) ); ++ ++ /* Maintain a monotonic global value */ ++ do { ++ last = read_atomic(&last_value); ++ if ( ret < last ) ++ return last; ++ } while ( unlikely(cmpxchg(&last_value, last, ret) != last) ); ++ ++ return ret; ++} ++ ++static struct platform_timesource __initdata plt_xen_timer = ++{ ++ .id = "xen", ++ .name = "XEN PV CLOCK", ++ .read_counter = read_xen_timer, ++ .init = init_xen_timer, ++ .counter_bits = 63, ++}; ++#endif ++ + /************************************************************ + * GENERIC PLATFORM TIMER INFRASTRUCTURE + */ +@@ -672,6 +758,9 @@ static s64 __init try_platform_timer(struct 
platform_timesource *pts) + static u64 __init init_platform_timer(void) + { + static struct platform_timesource * __initdata plt_timers[] = { ++#ifdef CONFIG_XEN_GUEST ++ &plt_xen_timer, ++#endif + &plt_hpet, &plt_pmtimer, &plt_pit + }; + +-- +2.14.3 + + +From 949eb11d5813466f1456a6229ff01e294fb1cdeb Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Fri, 17 Nov 2017 12:46:41 +0000 +Subject: [PATCH 44/77] x86: APIC timer calibration when running as a guest + +The timer calibration currently depends on PIT. Introduce a variant +to wait for a tick's worth of time to elapse when running as a PVH +guest. + +Signed-off-by: Wei Liu +Reviewed-by: Jan Beulich +--- + xen/arch/x86/apic.c | 38 ++++++++++++++++++++++++++++++-------- + 1 file changed, 30 insertions(+), 8 deletions(-) + +diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c +index ed59440c45..5039173827 100644 +--- a/xen/arch/x86/apic.c ++++ b/xen/arch/x86/apic.c +@@ -36,6 +36,8 @@ + #include + #include + #include ++#include ++#include + + static bool __read_mostly tdt_enabled; + static bool __initdata tdt_enable = true; +@@ -1091,6 +1093,20 @@ static void setup_APIC_timer(void) + local_irq_restore(flags); + } + ++static void wait_tick_pvh(void) ++{ ++ u64 lapse_ns = 1000000000ULL / HZ; ++ s_time_t start, curr_time; ++ ++ start = NOW(); ++ ++ /* Won't wrap around */ ++ do { ++ cpu_relax(); ++ curr_time = NOW(); ++ } while ( curr_time - start < lapse_ns ); ++} ++ + /* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq +@@ -1123,12 +1139,15 @@ static int __init calibrate_APIC_clock(void) + */ + __setup_APIC_LVTT(1000000000); + +- /* +- * The timer chip counts down to zero. Let's wait +- * for a wraparound to start exact measurement: +- * (the current tick might have been already half done) +- */ +- wait_8254_wraparound(); ++ if ( !xen_guest ) ++ /* ++ * The timer chip counts down to zero. 
Let's wait ++ * for a wraparound to start exact measurement: ++ * (the current tick might have been already half done) ++ */ ++ wait_8254_wraparound(); ++ else ++ wait_tick_pvh(); + + /* + * We wrapped around just now. Let's start: +@@ -1137,10 +1156,13 @@ static int __init calibrate_APIC_clock(void) + tt1 = apic_read(APIC_TMCCT); + + /* +- * Let's wait LOOPS wraprounds: ++ * Let's wait LOOPS ticks: + */ + for (i = 0; i < LOOPS; i++) +- wait_8254_wraparound(); ++ if ( !xen_guest ) ++ wait_8254_wraparound(); ++ else ++ wait_tick_pvh(); + + tt2 = apic_read(APIC_TMCCT); + t2 = rdtsc_ordered(); +-- +2.14.3 + + +From 5a543c6f397c9e4f8068e83246967ca7bd92605c Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Fri, 17 Nov 2017 15:19:09 +0000 +Subject: [PATCH 45/77] x86: read wallclock from Xen when running in pvh mode + +Signed-off-by: Wei Liu +Reviewed-by: Jan Beulich +--- + xen/arch/x86/time.c | 32 ++++++++++++++++++++++++++++---- + 1 file changed, 28 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c +index ab866ad68d..2dbf1c7d75 100644 +--- a/xen/arch/x86/time.c ++++ b/xen/arch/x86/time.c +@@ -964,6 +964,30 @@ static unsigned long get_cmos_time(void) + return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec); + } + ++static unsigned long get_wallclock_time(void) ++{ ++#ifdef CONFIG_XEN_GUEST ++ if ( xen_guest ) ++ { ++ struct shared_info *sh_info = XEN_shared_info; ++ uint32_t wc_version; ++ uint64_t wc_sec; ++ ++ do { ++ wc_version = sh_info->wc_version & ~1; ++ smp_rmb(); ++ ++ wc_sec = sh_info->wc_sec; ++ smp_rmb(); ++ } while ( wc_version != sh_info->wc_version ); ++ ++ return wc_sec + read_xen_timer() / 1000000000; ++ } ++#endif ++ ++ return get_cmos_time(); ++} ++ + /*************************************************************************** + * System Time + ***************************************************************************/ +@@ -1759,8 +1783,8 @@ int __init init_xen_time(void) + + 
open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration); + +- /* NB. get_cmos_time() can take over one second to execute. */ +- do_settime(get_cmos_time(), 0, NOW()); ++ /* NB. get_wallclock_time() can take over one second to execute. */ ++ do_settime(get_wallclock_time(), 0, NOW()); + + /* Finish platform timer initialization. */ + try_platform_timer_tail(false); +@@ -1870,7 +1894,7 @@ int time_suspend(void) + { + if ( smp_processor_id() == 0 ) + { +- cmos_utc_offset = -get_cmos_time(); ++ cmos_utc_offset = -get_wallclock_time(); + cmos_utc_offset += get_sec(); + kill_timer(&calibration_timer); + +@@ -1897,7 +1921,7 @@ int time_resume(void) + + set_timer(&calibration_timer, NOW() + EPOCH); + +- do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW()); ++ do_settime(get_wallclock_time() + cmos_utc_offset, 0, NOW()); + + update_vcpu_system_time(current); + +-- +2.14.3 + + +From 3b058a3eabf24b4b31521a49a600438b6a511739 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Thu, 11 Jan 2018 13:45:48 +0000 +Subject: [PATCH 46/77] x86: don't swallow the first command line item in guest + mode + +Signed-off-by: Wei Liu +--- + xen/arch/x86/setup.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index b9b97d68f5..c1f4184e06 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -632,8 +632,8 @@ static char * __init cmdline_cook(char *p, const char *loader_name) + while ( *p == ' ' ) + p++; + +- /* GRUB2 does not include image name as first item on command line. */ +- if ( loader_is_grub2(loader_name) ) ++ /* GRUB2 and PVH don't include image name as first item on command line. */ ++ if ( xen_guest || loader_is_grub2(loader_name) ) + return p; + + /* Strip image name plus whitespace. 
*/ +-- +2.14.3 + + +From cb5dc94ba74f06c574390b58695dd2b4d4971571 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 9 Jan 2018 12:51:37 +0000 +Subject: [PATCH 47/77] x86/guest: setup event channel upcall vector +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +And a dummy event channel upcall handler. + +Note that with the current code the underlying Xen (L0) must support +HVMOP_set_evtchn_upcall_vector or else event channel setup is going to +fail. This limitation can be lifted by implementing more event channel +interrupt injection methods as a backup. + +Register callback_irq to trick toolstack to think the domain is +enlightened. + +Signed-off-by: Sergey Dyasli +Signed-off-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Signed-off-by: Wei Liu +--- + xen/arch/x86/guest/xen.c | 41 +++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/guest/hypercall.h | 17 +++++++++++++++ + 2 files changed, 58 insertions(+) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 60626ec21c..59871170c8 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -24,6 +24,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -186,6 +187,43 @@ static void __init init_memmap(void) + } + } + ++static void xen_evtchn_upcall(struct cpu_user_regs *regs) ++{ ++ struct vcpu_info *vcpu_info = this_cpu(vcpu_info); ++ ++ vcpu_info->evtchn_upcall_pending = 0; ++ write_atomic(&vcpu_info->evtchn_pending_sel, 0); ++ ++ ack_APIC_irq(); ++} ++ ++static void init_evtchn(void) ++{ ++ static uint8_t evtchn_upcall_vector; ++ int rc; ++ ++ if ( !evtchn_upcall_vector ) ++ alloc_direct_apic_vector(&evtchn_upcall_vector, xen_evtchn_upcall); ++ ++ ASSERT(evtchn_upcall_vector); ++ ++ rc = xen_hypercall_set_evtchn_upcall_vector(this_cpu(vcpu_id), ++ evtchn_upcall_vector); ++ if ( rc ) ++ panic("Unable to set evtchn upcall vector: %d", rc); ++ ++ /* Trick toolstack to think we are enlightened 
*/ ++ { ++ struct xen_hvm_param a = { ++ .domid = DOMID_SELF, ++ .index = HVM_PARAM_CALLBACK_IRQ, ++ .value = 1, ++ }; ++ ++ BUG_ON(xen_hypercall_hvm_op(HVMOP_set_param, &a)); ++ } ++} ++ + void __init hypervisor_setup(void) + { + init_memmap(); +@@ -210,12 +248,15 @@ void __init hypervisor_setup(void) + "unable to map vCPU info, limiting vCPUs to: %u\n", + XEN_LEGACY_MAX_VCPUS); + } ++ ++ init_evtchn(); + } + + void hypervisor_ap_setup(void) + { + set_vcpu_id(); + map_vcpuinfo(); ++ init_evtchn(); + } + + int hypervisor_alloc_unused_page(mfn_t *mfn) +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index dbc57a566e..b36a1cc189 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -25,6 +25,7 @@ + + #include + #include ++#include + + #include + +@@ -104,6 +105,11 @@ static inline int xen_hypercall_vcpu_op(unsigned int cmd, unsigned int vcpu, + return _hypercall64_3(long, __HYPERVISOR_vcpu_op, cmd, vcpu, arg); + } + ++static inline long xen_hypercall_hvm_op(unsigned int op, void *arg) ++{ ++ return _hypercall64_2(long, __HYPERVISOR_hvm_op, op, arg); ++} ++ + /* + * Higher level hypercall helpers + */ +@@ -120,6 +126,17 @@ static inline long xen_hypercall_shutdown(unsigned int reason) + return xen_hypercall_sched_op(SCHEDOP_shutdown, &s); + } + ++static inline long xen_hypercall_set_evtchn_upcall_vector( ++ unsigned int cpu, unsigned int vector) ++{ ++ struct xen_hvm_evtchn_upcall_vector a = { ++ .vcpu = cpu, ++ .vector = vector, ++ }; ++ ++ return xen_hypercall_hvm_op(HVMOP_set_evtchn_upcall_vector, &a); ++} ++ + #else /* CONFIG_XEN_GUEST */ + + #include +-- +2.14.3 + + +From 7477359b9a462d066a4819cefb6d6e60bc4defc5 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Fri, 24 Nov 2017 11:07:32 +0000 +Subject: [PATCH 48/77] x86/guest: add PV console code + +Signed-off-by: Sergey Dyasli +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- + xen/drivers/char/Makefile | 1 + + 
xen/drivers/char/xen_pv_console.c | 205 ++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/fixmap.h | 1 + + xen/include/asm-x86/guest/hypercall.h | 33 ++++++ + xen/include/xen/pv_console.h | 32 ++++++ + 5 files changed, 272 insertions(+) + create mode 100644 xen/drivers/char/xen_pv_console.c + create mode 100644 xen/include/xen/pv_console.h + +diff --git a/xen/drivers/char/Makefile b/xen/drivers/char/Makefile +index aa169d7961..9d48d0f2dc 100644 +--- a/xen/drivers/char/Makefile ++++ b/xen/drivers/char/Makefile +@@ -8,3 +8,4 @@ obj-$(CONFIG_HAS_SCIF) += scif-uart.o + obj-$(CONFIG_HAS_EHCI) += ehci-dbgp.o + obj-$(CONFIG_ARM) += arm-uart.o + obj-y += serial.o ++obj-$(CONFIG_XEN_GUEST) += xen_pv_console.o +diff --git a/xen/drivers/char/xen_pv_console.c b/xen/drivers/char/xen_pv_console.c +new file mode 100644 +index 0000000000..f5aca4c69e +--- /dev/null ++++ b/xen/drivers/char/xen_pv_console.c +@@ -0,0 +1,205 @@ ++/****************************************************************************** ++ * drivers/char/xen_pv_console.c ++ * ++ * A frontend driver for Xen's PV console. ++ * Can be used when Xen is running on top of Xen in pv-in-pvh mode. ++ * (Linux's name for this is hvc console) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++static struct xencons_interface *cons_ring; ++static evtchn_port_t cons_evtchn; ++static serial_rx_fn cons_rx_handler; ++static DEFINE_SPINLOCK(tx_lock); ++ ++void __init pv_console_init(void) ++{ ++ long r; ++ uint64_t raw_pfn = 0, raw_evtchn = 0; ++ ++ if ( !xen_guest ) ++ { ++ printk("PV console init failed: xen_guest mode is not active!\n"); ++ return; ++ } ++ ++ r = xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_PFN, &raw_pfn); ++ if ( r < 0 ) ++ goto error; ++ ++ r = xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_EVTCHN, &raw_evtchn); ++ if ( r < 0 ) ++ goto error; ++ ++ set_fixmap(FIX_PV_CONSOLE, raw_pfn << PAGE_SHIFT); ++ cons_ring = (struct xencons_interface *)fix_to_virt(FIX_PV_CONSOLE); ++ cons_evtchn = raw_evtchn; ++ ++ printk("Initialised PV console at 0x%p with pfn %#lx and evtchn %#x\n", ++ cons_ring, raw_pfn, cons_evtchn); ++ return; ++ ++ error: ++ printk("Couldn't initialise PV console\n"); ++} ++ ++void __init pv_console_set_rx_handler(serial_rx_fn fn) ++{ ++ cons_rx_handler = fn; ++} ++ ++void __init pv_console_init_postirq(void) ++{ ++ if ( !cons_ring ) ++ return; ++ ++ xen_hypercall_evtchn_unmask(cons_evtchn); ++} ++ ++static void notify_daemon(void) ++{ ++ xen_hypercall_evtchn_send(cons_evtchn); ++} ++ ++size_t pv_console_rx(struct cpu_user_regs *regs) ++{ ++ char c; ++ XENCONS_RING_IDX cons, prod; ++ size_t recv = 0; ++ ++ if ( !cons_ring ) ++ return 0; ++ ++ /* TODO: move this somewhere */ ++ if ( !test_bit(cons_evtchn, XEN_shared_info->evtchn_pending) ) ++ return 0; ++ ++ prod = ACCESS_ONCE(cons_ring->in_prod); ++ cons = cons_ring->in_cons; ++ ++ /* ++ * Latch pointers before accessing the ring. Included compiler barrier also ++ * ensures that pointers are really read only once into local variables. 
++ */ ++ smp_rmb(); ++ ++ ASSERT((prod - cons) <= sizeof(cons_ring->in)); ++ ++ while ( cons != prod ) ++ { ++ c = cons_ring->in[MASK_XENCONS_IDX(cons++, cons_ring->in)]; ++ if ( cons_rx_handler ) ++ cons_rx_handler(c, regs); ++ recv++; ++ } ++ ++ /* No need for a mem barrier because every character was already consumed */ ++ barrier(); ++ ACCESS_ONCE(cons_ring->in_cons) = cons; ++ notify_daemon(); ++ ++ clear_bit(cons_evtchn, XEN_shared_info->evtchn_pending); ++ ++ return recv; ++} ++ ++static size_t pv_ring_puts(const char *buf) ++{ ++ XENCONS_RING_IDX cons, prod; ++ size_t sent = 0, avail; ++ bool put_r = false; ++ ++ while ( buf[sent] != '\0' || put_r ) ++ { ++ cons = ACCESS_ONCE(cons_ring->out_cons); ++ prod = cons_ring->out_prod; ++ ++ /* ++ * Latch pointers before accessing the ring. Included compiler barrier ++ * ensures that pointers are really read only once into local variables. ++ */ ++ smp_rmb(); ++ ++ ASSERT((prod - cons) <= sizeof(cons_ring->out)); ++ avail = sizeof(cons_ring->out) - (prod - cons); ++ ++ if ( avail == 0 ) ++ { ++ /* Wait for xenconsoled to consume our output */ ++ xen_hypercall_sched_op(SCHEDOP_yield, NULL); ++ continue; ++ } ++ ++ while ( avail && (buf[sent] != '\0' || put_r) ) ++ { ++ if ( put_r ) ++ { ++ cons_ring->out[MASK_XENCONS_IDX(prod++, cons_ring->out)] = '\r'; ++ put_r = false; ++ } ++ else ++ { ++ cons_ring->out[MASK_XENCONS_IDX(prod++, cons_ring->out)] = ++ buf[sent]; ++ ++ /* Send '\r' for every '\n' */ ++ if ( buf[sent] == '\n' ) ++ put_r = true; ++ sent++; ++ } ++ avail--; ++ } ++ ++ /* Write to the ring before updating the pointer */ ++ smp_wmb(); ++ ACCESS_ONCE(cons_ring->out_prod) = prod; ++ notify_daemon(); ++ } ++ ++ return sent; ++} ++ ++void pv_console_puts(const char *buf) ++{ ++ unsigned long flags; ++ ++ if ( !cons_ring ) ++ return; ++ ++ spin_lock_irqsave(&tx_lock, flags); ++ pv_ring_puts(buf); ++ spin_unlock_irqrestore(&tx_lock, flags); ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: 
"BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h +index ded4ddf21b..16ccaa2c77 100644 +--- a/xen/include/asm-x86/fixmap.h ++++ b/xen/include/asm-x86/fixmap.h +@@ -46,6 +46,7 @@ enum fixed_addresses { + FIX_COM_END, + FIX_EHCI_DBGP, + #ifdef CONFIG_XEN_GUEST ++ FIX_PV_CONSOLE, + FIX_XEN_SHARED_INFO, + #endif /* CONFIG_XEN_GUEST */ + /* Everything else should go further down. */ +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index b36a1cc189..81a955d479 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -105,6 +105,11 @@ static inline int xen_hypercall_vcpu_op(unsigned int cmd, unsigned int vcpu, + return _hypercall64_3(long, __HYPERVISOR_vcpu_op, cmd, vcpu, arg); + } + ++static inline long xen_hypercall_event_channel_op(unsigned int cmd, void *arg) ++{ ++ return _hypercall64_2(long, __HYPERVISOR_event_channel_op, cmd, arg); ++} ++ + static inline long xen_hypercall_hvm_op(unsigned int op, void *arg) + { + return _hypercall64_2(long, __HYPERVISOR_hvm_op, op, arg); +@@ -126,6 +131,34 @@ static inline long xen_hypercall_shutdown(unsigned int reason) + return xen_hypercall_sched_op(SCHEDOP_shutdown, &s); + } + ++static inline long xen_hypercall_evtchn_send(evtchn_port_t port) ++{ ++ struct evtchn_send send = { .port = port }; ++ ++ return xen_hypercall_event_channel_op(EVTCHNOP_send, &send); ++} ++ ++static inline long xen_hypercall_evtchn_unmask(evtchn_port_t port) ++{ ++ struct evtchn_unmask unmask = { .port = port }; ++ ++ return xen_hypercall_event_channel_op(EVTCHNOP_unmask, &unmask); ++} ++ ++static inline long xen_hypercall_hvm_get_param(uint32_t index, uint64_t *value) ++{ ++ struct xen_hvm_param xhv = { ++ .domid = DOMID_SELF, ++ .index = index, ++ }; ++ long ret = xen_hypercall_hvm_op(HVMOP_get_param, &xhv); ++ ++ if ( ret == 0 ) ++ *value = xhv.value; 
++ ++ return ret; ++} ++ + static inline long xen_hypercall_set_evtchn_upcall_vector( + unsigned int cpu, unsigned int vector) + { +diff --git a/xen/include/xen/pv_console.h b/xen/include/xen/pv_console.h +new file mode 100644 +index 0000000000..e578b56620 +--- /dev/null ++++ b/xen/include/xen/pv_console.h +@@ -0,0 +1,32 @@ ++#ifndef __XEN_PV_CONSOLE_H__ ++#define __XEN_PV_CONSOLE_H__ ++ ++#include ++ ++#ifdef CONFIG_XEN_GUEST ++ ++void pv_console_init(void); ++void pv_console_set_rx_handler(serial_rx_fn fn); ++void pv_console_init_postirq(void); ++void pv_console_puts(const char *buf); ++size_t pv_console_rx(struct cpu_user_regs *regs); ++ ++#else ++ ++static inline void pv_console_init(void) {} ++static inline void pv_console_set_rx_handler(serial_rx_fn fn) { } ++static inline void pv_console_init_postirq(void) { } ++static inline void pv_console_puts(const char *buf) { } ++static inline size_t pv_console_rx(struct cpu_user_regs *regs) { return 0; } ++ ++#endif /* !CONFIG_XEN_GUEST */ ++#endif /* __XEN_PV_CONSOLE_H__ */ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From aa96a59dc2290fc3084525659282a59b29eff1d5 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Fri, 24 Nov 2017 11:21:17 +0000 +Subject: [PATCH 49/77] x86/guest: use PV console for Xen/Dom0 I/O + +Signed-off-by: Sergey Dyasli +Signed-off-by: Wei Liu +--- + docs/misc/xen-command-line.markdown | 5 ++++- + xen/arch/x86/guest/xen.c | 3 +++ + xen/drivers/char/console.c | 16 ++++++++++++++++ + 3 files changed, 23 insertions(+), 1 deletion(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index e5979bceee..da006dd4f7 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -365,7 +365,7 @@ The following are examples of correct specifications: + Specify the size of the console ring buffer. 
+ + ### console +-> `= List of [ vga | com1[H,L] | com2[H,L] | dbgp | none ]` ++> `= List of [ vga | com1[H,L] | com2[H,L] | pv | dbgp | none ]` + + > Default: `console=com1,vga` + +@@ -381,6 +381,9 @@ the converse; transmitted and received characters will have their MSB + cleared. This allows a single port to be shared by two subsystems + (e.g. console and debugger). + ++`pv` indicates that Xen should use Xen's PV console. This option is ++only available when used together with `pv-in-pvh`. ++ + `dbgp` indicates that Xen should use a USB debug port. + + `none` indicates that Xen should not use a console. This option only +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 59871170c8..d4968b47aa 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -194,6 +195,8 @@ static void xen_evtchn_upcall(struct cpu_user_regs *regs) + vcpu_info->evtchn_upcall_pending = 0; + write_atomic(&vcpu_info->evtchn_pending_sel, 0); + ++ pv_console_rx(regs); ++ + ack_APIC_irq(); + } + +diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c +index d05ebf9f70..8acd358395 100644 +--- a/xen/drivers/char/console.c ++++ b/xen/drivers/char/console.c +@@ -32,6 +32,7 @@ + #include + + #ifdef CONFIG_X86 ++#include + #include + #endif + +@@ -344,6 +345,11 @@ static void sercon_puts(const char *s) + (*serial_steal_fn)(s); + else + serial_puts(sercon_handle, s); ++ ++#ifdef CONFIG_X86 ++ /* Copy all serial output into PV console */ ++ pv_console_puts(s); ++#endif + } + + static void dump_console_ring_key(unsigned char key) +@@ -805,6 +811,8 @@ void __init console_init_preirq(void) + if ( !strncmp(p, "vga", 3) ) + video_init(); + #ifdef CONFIG_X86 ++ else if ( !strncmp(p, "pv", 2) ) ++ pv_console_init(); + else if ( !strncmp(p, "xen", 3) ) + opt_console_xen = true; + #endif +@@ -828,6 +836,10 @@ void __init console_init_preirq(void) + + 
serial_set_rx_handler(sercon_handle, serial_rx); + ++#ifdef CONFIG_X86 ++ pv_console_set_rx_handler(serial_rx); ++#endif ++ + /* HELLO WORLD --- start-of-day banner text. */ + spin_lock(&console_lock); + __putstr(xen_banner()); +@@ -880,6 +892,10 @@ void __init console_init_postirq(void) + { + serial_init_postirq(); + ++#ifdef CONFIG_X86 ++ pv_console_init_postirq(); ++#endif ++ + if ( conring != _conring ) + return; + +-- +2.14.3 + + +From b5ead1fad3930a3e1034f64f9af416ae211e27da Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 10 Nov 2017 16:35:26 +0000 +Subject: [PATCH 50/77] x86/shim: Kconfig and command line options + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +--- + docs/misc/xen-command-line.markdown | 11 ++++++++++ + xen/arch/x86/Kconfig | 22 +++++++++++++++++++ + xen/arch/x86/pv/Makefile | 1 + + xen/arch/x86/pv/shim.c | 39 ++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/guest.h | 1 + + xen/include/asm-x86/pv/shim.h | 42 +++++++++++++++++++++++++++++++++++++ + 6 files changed, 116 insertions(+) + create mode 100644 xen/arch/x86/pv/shim.c + create mode 100644 xen/include/asm-x86/pv/shim.h + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index da006dd4f7..3a1a9c1fba 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1445,6 +1445,17 @@ do; there may be other custom operating systems which do. If you're + certain you don't plan on having PV guests which use this feature, + turning it off can reduce the attack surface. + ++### pv-shim (x86) ++> `= ` ++ ++> Default: `false` ++ ++This option is intended for use by a toolstack, when choosing to run a PV ++guest compatibly inside an HVM container. ++ ++In this mode, the kernel and initrd passed as modules to the hypervisor are ++constructed into a plain unprivileged PV domain. 
++ ### rcu-idle-timer-period-ms + > `= ` + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index c0b0bcdcb3..4953533f16 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -133,6 +133,28 @@ config PVH_GUEST + ---help--- + Support booting using the PVH ABI. + ++ If unsure, say N. ++ ++config PV_SHIM ++ def_bool n ++ prompt "PV Shim" ++ depends on PV && XEN_GUEST ++ ---help--- ++ Build Xen with a mode which acts as a shim to allow PV guest to run ++ in an HVM/PVH container. This mode can only be enabled with command ++ line option. ++ ++ If unsure, say N. ++ ++config PV_SHIM_EXCLUSIVE ++ def_bool n ++ prompt "PV Shim Exclusive" ++ depends on PV_SHIM ++ ---help--- ++ Build Xen in a way which unconditionally assumes PV_SHIM mode. This ++ option is only intended for use when building a dedicated PV Shim ++ firmware, and will not function correctly in other scenarios. ++ + If unsure, say N. + endmenu + +diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile +index bac2792aa2..65bca04175 100644 +--- a/xen/arch/x86/pv/Makefile ++++ b/xen/arch/x86/pv/Makefile +@@ -11,6 +11,7 @@ obj-y += iret.o + obj-y += misc-hypercalls.o + obj-y += mm.o + obj-y += ro-page-fault.o ++obj-$(CONFIG_PV_SHIM) += shim.o + obj-y += traps.o + + obj-bin-y += dom0_build.init.o +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +new file mode 100644 +index 0000000000..4d037355db +--- /dev/null ++++ b/xen/arch/x86/pv/shim.c +@@ -0,0 +1,39 @@ ++/****************************************************************************** ++ * arch/x86/pv/shim.c ++ * ++ * Functionality for PV Shim mode ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version.
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. ++ */ ++#include ++#include ++ ++#include ++ ++#ifndef CONFIG_PV_SHIM_EXCLUSIVE ++bool pv_shim; ++boolean_param("pv-shim", pv_shim); ++#endif ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/guest.h b/xen/include/asm-x86/guest.h +index 5abdb8c433..a38c6b5b3f 100644 +--- a/xen/include/asm-x86/guest.h ++++ b/xen/include/asm-x86/guest.h +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #endif /* __X86_GUEST_H__ */ + +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +new file mode 100644 +index 0000000000..1468cfd498 +--- /dev/null ++++ b/xen/include/asm-x86/pv/shim.h +@@ -0,0 +1,42 @@ ++/****************************************************************************** ++ * asm-x86/guest/shim.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms and conditions of the GNU General Public ++ * License, version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. 
++ */ ++ ++#ifndef __X86_PV_SHIM_H__ ++#define __X86_PV_SHIM_H__ ++ ++#include ++ ++#if defined(CONFIG_PV_SHIM_EXCLUSIVE) ++# define pv_shim 1 ++#elif defined(CONFIG_PV_SHIM) ++extern bool pv_shim; ++#else ++# define pv_shim 0 ++#endif /* CONFIG_PV_SHIM{,_EXCLUSIVE} */ ++ ++#endif /* __X86_PV_SHIM_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From 378425686619e5fae65988cfedd23d5883206c2b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 22 Nov 2017 13:31:26 +0000 +Subject: [PATCH 51/77] tools/firmware: Build and install xen-shim +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Link a minimum set of files to build the shim. The linkfarm rune can +handle creation and deletion of files. Introduce build-shim and +install-shim targets in xen/Makefile. + +We can do better by properly generating the dependency from the list of +files but that's an improvement for later. + +Signed-off-by: Andrew Cooper +Signed-off-by: Wei Liu +[change default scheduler to credit] +Signed-off-by: Roger Pau Monné +--- +v2: Introduce a top-level build-shim target. Split the xen-shim build + from the normal build.
+--- + .gitignore | 4 ++ + tools/firmware/Makefile | 9 ++++ + tools/firmware/xen-dir/Makefile | 59 ++++++++++++++++++++++++++ + tools/firmware/xen-dir/shim.config | 87 ++++++++++++++++++++++++++++++++++++++ + xen/Makefile | 16 +++++-- + 5 files changed, 172 insertions(+), 3 deletions(-) + create mode 100644 tools/firmware/xen-dir/Makefile + create mode 100644 tools/firmware/xen-dir/shim.config + +diff --git a/.gitignore b/.gitignore +index 8da67daf31..f6cc61a701 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -155,6 +155,10 @@ tools/firmware/rombios/rombios[^/]*.s + tools/firmware/rombios/32bit/32bitbios_flat.h + tools/firmware/vgabios/vbetables-gen + tools/firmware/vgabios/vbetables.h ++tools/firmware/xen-dir/*.old ++tools/firmware/xen-dir/linkfarm.stamp* ++tools/firmware/xen-dir/xen-root ++tools/firmware/xen-dir/xen-shim + tools/flask/utils/flask-getenforce + tools/flask/utils/flask-get-bool + tools/flask/utils/flask-loadpolicy +diff --git a/tools/firmware/Makefile b/tools/firmware/Makefile +index 868b506920..9387cc0878 100644 +--- a/tools/firmware/Makefile ++++ b/tools/firmware/Makefile +@@ -1,6 +1,8 @@ + XEN_ROOT = $(CURDIR)/../.. + include $(XEN_ROOT)/tools/Rules.mk + ++CONFIG_PV_SHIM := y ++ + # hvmloader is a 32-bit protected mode binary. 
+ TARGET := hvmloader/hvmloader + INST_DIR := $(DESTDIR)$(XENFIRMWAREDIR) +@@ -11,6 +13,7 @@ SUBDIRS-$(CONFIG_SEABIOS) += seabios-dir + SUBDIRS-$(CONFIG_ROMBIOS) += rombios + SUBDIRS-$(CONFIG_ROMBIOS) += vgabios + SUBDIRS-$(CONFIG_ROMBIOS) += etherboot ++SUBDIRS-$(CONFIG_PV_SHIM) += xen-dir + SUBDIRS-y += hvmloader + + LD32BIT-$(CONFIG_FreeBSD) := LD32BIT_FLAG=-melf_i386_fbsd +@@ -48,6 +51,9 @@ endif + ifeq ($(CONFIG_OVMF),y) + $(INSTALL_DATA) ovmf-dir/ovmf.bin $(INST_DIR)/ovmf.bin + endif ++ifeq ($(CONFIG_PV_SHIM),y) ++ $(INSTALL_DATA) xen-dir/xen-shim $(INST_DIR)/xen-shim ++endif + + .PHONY: uninstall + uninstall: +@@ -58,6 +64,9 @@ endif + ifeq ($(CONFIG_OVMF),y) + rm -f $(INST_DIR)/ovmf.bin + endif ++ifeq ($(CONFIG_PV_SHIM),y) ++ rm -f $(INST_DIR)/xen-shim ++endif + + .PHONY: clean + clean: subdirs-clean +diff --git a/tools/firmware/xen-dir/Makefile b/tools/firmware/xen-dir/Makefile +new file mode 100644 +index 0000000000..adf6c31e8d +--- /dev/null ++++ b/tools/firmware/xen-dir/Makefile +@@ -0,0 +1,59 @@ ++XEN_ROOT = $(CURDIR)/../../.. ++ ++all: xen-shim ++ ++.PHONY: FORCE ++FORCE: ++ ++D=xen-root ++ ++# Minimum set of files / directories to get Xen to build ++LINK_DIRS=config xen ++LINK_FILES=Config.mk ++ ++DEP_DIRS=$(foreach i, $(LINK_DIRS), $(XEN_ROOT)/$(i)) ++DEP_FILES=$(foreach i, $(LINK_FILES), $(XEN_ROOT)/$(i)) ++ ++linkfarm.stamp: $(DEP_DIRS) $(DEP_FILES) FORCE ++ mkdir -p $(D) ++ set -e ++ rm -f linkfarm.stamp.tmp ++ $(foreach d, $(LINK_DIRS), \ ++ (mkdir -p $(D)/$(d); \ ++ cd $(D)/$(d); \ ++ find $(XEN_ROOT)/$(d)/ -type d -printf "./%P\n" | xargs mkdir -p);) ++ $(foreach d, $(LINK_DIRS), \ ++ (cd $(XEN_ROOT); \ ++ find $(d) ! -type l -type f \ ++ $(addprefix !
-path , '*.[oda1]' '*.d[12]')) \ ++ >> linkfarm.stamp.tmp ; ) ++ $(foreach f, $(LINK_FILES), \ ++ echo $(f) >> linkfarm.stamp.tmp ;) ++ cmp -s linkfarm.stamp.tmp linkfarm.stamp && \ ++ rm linkfarm.stamp.tmp || { \ ++ mv linkfarm.stamp.tmp linkfarm.stamp; \ ++ cat linkfarm.stamp | while read f; \ ++ do rm -f "$(D)/$$f"; ln -s "$(XEN_ROOT)/$$f" "$(D)/$$f"; done \ ++ } ++ ++# Copy enough of the tree to build the shim hypervisor ++$(D): linkfarm.stamp ++ $(MAKE) -C $(D)/xen distclean ++ ++.PHONY: shim-%config ++shim-%config: $(D) FORCE ++ $(MAKE) -C $(D)/xen $*config \ ++ XEN_CONFIG_EXPERT=y \ ++ KCONFIG_CONFIG=$(CURDIR)/shim.config ++ ++xen-shim: $(D) shim-olddefconfig ++ $(MAKE) -C $(D)/xen install-shim \ ++ XEN_CONFIG_EXPERT=y \ ++ KCONFIG_CONFIG=$(CURDIR)/shim.config \ ++ DESTDIR=$(CURDIR) ++ ++.PHONY: distclean clean ++distclean clean: ++ rm -f xen-shim *.old ++ rm -rf $(D) ++ rm -f linkfarm.stamp* +diff --git a/tools/firmware/xen-dir/shim.config b/tools/firmware/xen-dir/shim.config +new file mode 100644 +index 0000000000..151a8b41e5 +--- /dev/null ++++ b/tools/firmware/xen-dir/shim.config +@@ -0,0 +1,87 @@ ++# ++# Automatically generated file; DO NOT EDIT. 
++# Xen/x86 4.11-unstable Configuration ++# ++CONFIG_X86_64=y ++CONFIG_X86=y ++CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" ++ ++# ++# Architecture Features ++# ++CONFIG_NR_CPUS=32 ++CONFIG_PV=y ++CONFIG_PV_LINEAR_PT=y ++CONFIG_HVM=y ++# CONFIG_SHADOW_PAGING is not set ++# CONFIG_BIGMEM is not set ++# CONFIG_HVM_FEP is not set ++# CONFIG_TBOOT is not set ++CONFIG_XEN_GUEST=y ++CONFIG_PVH_GUEST=y ++CONFIG_PV_SHIM=y ++CONFIG_PV_SHIM_EXCLUSIVE=y ++ ++# ++# Common Features ++# ++CONFIG_COMPAT=y ++CONFIG_CORE_PARKING=y ++CONFIG_HAS_ALTERNATIVE=y ++CONFIG_HAS_EX_TABLE=y ++CONFIG_HAS_MEM_ACCESS=y ++CONFIG_HAS_MEM_PAGING=y ++CONFIG_HAS_MEM_SHARING=y ++CONFIG_HAS_PDX=y ++CONFIG_HAS_UBSAN=y ++CONFIG_HAS_KEXEC=y ++CONFIG_HAS_GDBSX=y ++CONFIG_HAS_IOPORTS=y ++# CONFIG_KEXEC is not set ++# CONFIG_TMEM is not set ++# CONFIG_XENOPROF is not set ++# CONFIG_XSM is not set ++ ++# ++# Schedulers ++# ++CONFIG_SCHED_CREDIT=y ++# CONFIG_SCHED_CREDIT2 is not set ++# CONFIG_SCHED_RTDS is not set ++# CONFIG_SCHED_ARINC653 is not set ++# CONFIG_SCHED_NULL is not set ++# CONFIG_SCHED_CREDIT_DEFAULT is not set ++CONFIG_SCHED_CREDIT_DEFAULT=y ++CONFIG_SCHED_DEFAULT="credit" ++# CONFIG_LIVEPATCH is not set ++# CONFIG_SUPPRESS_DUPLICATE_SYMBOL_WARNINGS is not set ++CONFIG_CMDLINE="" ++ ++# ++# Device Drivers ++# ++CONFIG_ACPI=y ++CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y ++CONFIG_NUMA=y ++CONFIG_HAS_NS16550=y ++CONFIG_HAS_EHCI=y ++CONFIG_HAS_CPUFREQ=y ++CONFIG_HAS_PASSTHROUGH=y ++CONFIG_HAS_PCI=y ++CONFIG_VIDEO=y ++CONFIG_VGA=y ++CONFIG_DEFCONFIG_LIST="$ARCH_DEFCONFIG" ++CONFIG_ARCH_SUPPORTS_INT128=y ++ ++# ++# Debugging Options ++# ++# CONFIG_DEBUG is not set ++# CONFIG_CRASH_DEBUG is not set ++# CONFIG_FRAME_POINTER is not set ++# CONFIG_GCOV is not set ++# CONFIG_LOCK_PROFILE is not set ++# CONFIG_PERF_COUNTERS is not set ++# CONFIG_VERBOSE_DEBUG is not set ++# CONFIG_SCRUB_DEBUG is not set ++# CONFIG_UBSAN is not set +diff --git a/xen/Makefile b/xen/Makefile +index 
58a1f97d7d..623f889082 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -37,10 +37,10 @@ default: build + .PHONY: dist + dist: install + +-build install:: include/config/auto.conf ++build install build-shim:: include/config/auto.conf + +-.PHONY: build install uninstall clean distclean cscope TAGS tags MAP gtags tests +-build install uninstall debug clean distclean cscope TAGS tags MAP gtags tests:: ++.PHONY: build install uninstall clean distclean cscope TAGS tags MAP gtags tests install-shim build-shim ++build install uninstall debug clean distclean cscope TAGS tags MAP gtags tests install-shim build-shim:: + ifneq ($(XEN_TARGET_ARCH),x86_32) + $(MAKE) -f Rules.mk _$@ + else +@@ -80,6 +80,13 @@ _install: $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX) + fi; \ + fi + ++.PHONY: _build-shim ++_build-shim: $(TARGET)-shim ++ ++.PHONY: _install-shim ++_install-shim: build-shim ++ $(INSTALL_DATA) $(TARGET)-shim $(DESTDIR) ++ + .PHONY: _tests + _tests: + $(MAKE) -f $(BASEDIR)/Rules.mk -C test tests +@@ -144,6 +151,9 @@ $(TARGET): delete-unfresh-files + $(MAKE) -f $(BASEDIR)/Rules.mk include/asm-$(TARGET_ARCH)/asm-offsets.h + $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) $(TARGET) + ++$(TARGET)-shim: $(TARGET) ++ $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) $(TARGET)-shim ++ + # drivers/char/console.o contains static banner/compile info. Blow it away. + # Don't refresh these files during e.g., 'sudo make install' + .PHONY: delete-unfresh-files +-- +2.14.3 + + +From 2b8a95a2961ba4a5e54b45b49cb6528068a3c0b3 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 28 Nov 2017 09:54:17 +0000 +Subject: [PATCH 52/77] xen/x86: make VGA support selectable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Through a Kconfig option. Enable it by default, and disable it for the +PV-in-PVH shim. + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +--- +Changes since v1: + - Make the VGA option dependent on the shim one. 
+--- + tools/firmware/xen-dir/shim.config | 3 +-- + xen/arch/x86/Kconfig | 1 - + xen/arch/x86/boot/build32.mk | 1 + + xen/arch/x86/boot/cmdline.c | 5 ++++- + xen/arch/x86/boot/trampoline.S | 7 +++++++ + xen/arch/x86/efi/efi-boot.h | 4 ++++ + xen/arch/x86/platform_hypercall.c | 2 ++ + xen/arch/x86/pv/dom0_build.c | 2 ++ + xen/arch/x86/setup.c | 6 ++++++ + xen/drivers/video/Kconfig | 8 +++++++- + xen/include/asm-x86/setup.h | 6 ++++++ + 11 files changed, 40 insertions(+), 5 deletions(-) + +diff --git a/tools/firmware/xen-dir/shim.config b/tools/firmware/xen-dir/shim.config +index 151a8b41e5..d22c2fd2f4 100644 +--- a/tools/firmware/xen-dir/shim.config ++++ b/tools/firmware/xen-dir/shim.config +@@ -68,8 +68,7 @@ CONFIG_HAS_EHCI=y + CONFIG_HAS_CPUFREQ=y + CONFIG_HAS_PASSTHROUGH=y + CONFIG_HAS_PCI=y +-CONFIG_VIDEO=y +-CONFIG_VGA=y ++# CONFIG_VGA is not set + CONFIG_DEFCONFIG_LIST="$ARCH_DEFCONFIG" + CONFIG_ARCH_SUPPORTS_INT128=y + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index 4953533f16..f621e799ed 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -24,7 +24,6 @@ config X86 + select HAS_PDX + select HAS_UBSAN + select NUMA +- select VGA + + config ARCH_DEFCONFIG + string +diff --git a/xen/arch/x86/boot/build32.mk b/xen/arch/x86/boot/build32.mk +index f7e8ebe67d..48c7407c00 100644 +--- a/xen/arch/x86/boot/build32.mk ++++ b/xen/arch/x86/boot/build32.mk +@@ -5,6 +5,7 @@ include $(XEN_ROOT)/Config.mk + $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS)) + + CFLAGS += -Werror -fno-asynchronous-unwind-tables -fno-builtin -g0 -msoft-float ++CFLAGS += -I$(XEN_ROOT)/xen/include + CFLAGS := $(filter-out -flto,$(CFLAGS)) + + # NB. 
awk invocation is a portable alternative to 'head -n -1' +diff --git a/xen/arch/x86/boot/cmdline.c b/xen/arch/x86/boot/cmdline.c +index 06aa064e72..51b0659a04 100644 +--- a/xen/arch/x86/boot/cmdline.c ++++ b/xen/arch/x86/boot/cmdline.c +@@ -30,6 +30,7 @@ asm ( + " jmp cmdline_parse_early \n" + ); + ++#include + #include "defs.h" + #include "video.h" + +@@ -336,5 +337,7 @@ void __stdcall cmdline_parse_early(const char *cmdline, early_boot_opts_t *ebo) + ebo->skip_realmode = skip_realmode(cmdline); + ebo->opt_edd = edd_parse(cmdline); + ebo->opt_edid = edid_parse(cmdline); +- vga_parse(cmdline, ebo); ++ ++ if ( IS_ENABLED(CONFIG_VIDEO) ) ++ vga_parse(cmdline, ebo); + } +diff --git a/xen/arch/x86/boot/trampoline.S b/xen/arch/x86/boot/trampoline.S +index 4d640f3fcd..a17a90df5e 100644 +--- a/xen/arch/x86/boot/trampoline.S ++++ b/xen/arch/x86/boot/trampoline.S +@@ -219,7 +219,9 @@ trampoline_boot_cpu_entry: + */ + call get_memory_map + call get_edd ++#ifdef CONFIG_VIDEO + call video ++#endif + + mov $0x0200,%ax + int $0x16 +@@ -267,10 +269,13 @@ opt_edid: + .byte 0 /* EDID parsing option (force/no/default). */ + /* Padding. */ + .byte 0 ++ ++#ifdef CONFIG_VIDEO + GLOBAL(boot_vid_mode) + .word VIDEO_80x25 /* If we don't run at all, assume basic video mode 3 at 80x25. 
*/ + vesa_size: + .word 0,0,0 /* width x depth x height */ ++#endif + + GLOBAL(kbd_shift_flags) + .byte 0 +@@ -279,4 +284,6 @@ rm_idt: .word 256*4-1, 0, 0 + + #include "mem.S" + #include "edd.S" ++#ifdef CONFIG_VIDEO + #include "video.S" ++#endif +diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h +index d30f688a5a..5789d2cb70 100644 +--- a/xen/arch/x86/efi/efi-boot.h ++++ b/xen/arch/x86/efi/efi-boot.h +@@ -479,16 +479,19 @@ static void __init efi_arch_edd(void) + + static void __init efi_arch_console_init(UINTN cols, UINTN rows) + { ++#ifdef CONFIG_VIDEO + vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3; + vga_console_info.u.text_mode_3.columns = cols; + vga_console_info.u.text_mode_3.rows = rows; + vga_console_info.u.text_mode_3.font_height = 16; ++#endif + } + + static void __init efi_arch_video_init(EFI_GRAPHICS_OUTPUT_PROTOCOL *gop, + UINTN info_size, + EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *mode_info) + { ++#ifdef CONFIG_VIDEO + int bpp = 0; + + switch ( mode_info->PixelFormat ) +@@ -550,6 +553,7 @@ static void __init efi_arch_video_init(EFI_GRAPHICS_OUTPUT_PROTOCOL *gop, + vga_console_info.u.vesa_lfb.lfb_size = + (gop->Mode->FrameBufferSize + 0xffff) >> 16; + } ++#endif + } + + static void __init efi_arch_memory_setup(void) +diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c +index ebc2f394ee..ea18c3215a 100644 +--- a/xen/arch/x86/platform_hypercall.c ++++ b/xen/arch/x86/platform_hypercall.c +@@ -388,6 +388,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op) + } + case XEN_FW_VBEDDC_INFO: + ret = -ESRCH; ++#ifdef CONFIG_VIDEO + if ( op->u.firmware_info.index != 0 ) + break; + if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 ) +@@ -406,6 +407,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op) + copy_to_compat(op->u.firmware_info.u.vbeddc_info.edid, + bootsym(boot_edid_info), 128) ) + ret = -EFAULT; ++#endif + break; + case XEN_FW_EFI_INFO: + ret 
= efi_get_info(op->u.firmware_info.index, +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index a13412efb9..a3be335b0b 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -832,11 +832,13 @@ int __init dom0_construct_pv(struct domain *d, + if ( cmdline != NULL ) + strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)); + ++#ifdef CONFIG_VIDEO + if ( fill_console_start_info((void *)(si + 1)) ) + { + si->console.dom0.info_off = sizeof(struct start_info); + si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); + } ++#endif + + if ( is_pv_32bit_domain(d) ) + xlat_start_info(si, XLAT_start_info_console_dom0); +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index c1f4184e06..2279014f74 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -456,6 +456,7 @@ static void __init setup_max_pdx(unsigned long top_page) + /* A temporary copy of the e820 map that we can mess with during bootstrap. */ + static struct e820map __initdata boot_e820; + ++#ifdef CONFIG_VIDEO + struct boot_video_info { + u8 orig_x; /* 0x00 */ + u8 orig_y; /* 0x01 */ +@@ -486,9 +487,11 @@ struct boot_video_info { + u16 vesa_attrib; /* 0x28 */ + }; + extern struct boot_video_info boot_vid_info; ++#endif + + static void __init parse_video_info(void) + { ++#ifdef CONFIG_VIDEO + struct boot_video_info *bvi = &bootsym(boot_vid_info); + + /* vga_console_info is filled directly on EFI platform. */ +@@ -524,6 +527,7 @@ static void __init parse_video_info(void) + vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities; + vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib; + } ++#endif + } + + static void __init kexec_reserve_area(struct e820map *e820) +@@ -741,6 +745,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + printk("Xen image load base address: %#lx\n", xen_phys_start); + ++#ifdef CONFIG_VIDEO + printk("Video information:\n"); + + /* Print VGA display mode information. 
*/ +@@ -784,6 +789,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + printk("of reasons unknown\n"); + } + } ++#endif + + printk("Disc information:\n"); + printk(" Found %d MBR signatures\n", +diff --git a/xen/drivers/video/Kconfig b/xen/drivers/video/Kconfig +index 0ffbbd9a88..52e8ce6c15 100644 +--- a/xen/drivers/video/Kconfig ++++ b/xen/drivers/video/Kconfig +@@ -3,8 +3,14 @@ config VIDEO + bool + + config VGA +- bool ++ bool "VGA support" if !PV_SHIM_EXCLUSIVE + select VIDEO ++ depends on X86 ++ default y if !PV_SHIM_EXCLUSIVE ++ ---help--- ++ Enable VGA output for the Xen hypervisor. ++ ++ If unsure, say Y. + + config HAS_ARM_HDLCD + bool +diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h +index c5b3d4ef18..b68ec9de4d 100644 +--- a/xen/include/asm-x86/setup.h ++++ b/xen/include/asm-x86/setup.h +@@ -31,8 +31,14 @@ void arch_init_memory(void); + void subarch_init_memory(void); + + void init_IRQ(void); ++ ++#ifdef CONFIG_VIDEO + void vesa_init(void); + void vesa_mtrr_init(void); ++#else ++static inline void vesa_init(void) {}; ++static inline void vesa_mtrr_init(void) {}; ++#endif + + int construct_dom0( + struct domain *d, +-- +2.14.3 + + +From 4ba6447e7ddbee91c3781c2630ca1d28e080857c Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:18 +0000 +Subject: [PATCH 53/77] xen/pvh: do not mark the low 1MB as IO mem +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +On PVH there's nothing special on the low 1MB. + +This is an optional patch that doesn't affect the functionality of the +shim. 
+ +Signed-off-by: Roger Pau Monné +Signed-off-by: Andrew Cooper +--- + xen/arch/x86/mm.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0569342200..371c764027 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -122,6 +122,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -288,8 +289,12 @@ void __init arch_init_memory(void) + dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL); + BUG_ON(IS_ERR(dom_cow)); + +- /* First 1MB of RAM is historically marked as I/O. */ +- for ( i = 0; i < 0x100; i++ ) ++ /* ++ * First 1MB of RAM is historically marked as I/O. If we booted PVH, ++ * reclaim the space. Irrespective, leave MFN 0 as special for the sake ++ * of 0 being a very common default value. ++ */ ++ for ( i = 0; i < (pvh_boot ? 1 : 0x100); i++ ) + share_xen_page_with_guest(mfn_to_page(_mfn(i)), + dom_io, XENSHARE_writable); + +-- +2.14.3 + + +From 0ba5d8c27509ba2011591cfab2715e8ca6b7b402 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:18 +0000 +Subject: [PATCH 54/77] xen/pvshim: skip Dom0-only domain builder parts +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Do not allow access to any iomem or ioport by the shim, and also +remove the check for Dom0 kernel support. 
+ +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +--- + xen/arch/x86/dom0_build.c | 4 ++++ + xen/arch/x86/pv/dom0_build.c | 3 ++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c +index bf992fef6d..e2bf81b4e7 100644 +--- a/xen/arch/x86/dom0_build.c ++++ b/xen/arch/x86/dom0_build.c +@@ -13,6 +13,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -385,6 +386,9 @@ int __init dom0_setup_permissions(struct domain *d) + unsigned int i; + int rc; + ++ if ( pv_shim ) ++ return 0; ++ + /* The hardware domain is initially permitted full I/O capabilities. */ + rc = ioports_permit_access(d, 0, 0xFFFF); + rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1); +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index a3be335b0b..852d00a8be 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -17,6 +17,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -373,7 +374,7 @@ int __init dom0_construct_pv(struct domain *d, + + if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE ) + { +- if ( !test_bit(XENFEAT_dom0, parms.f_supported) ) ++ if ( !pv_shim && !test_bit(XENFEAT_dom0, parms.f_supported) ) + { + printk("Kernel does not support Dom0 operation\n"); + rc = -EINVAL; +-- +2.14.3 + + +From 60dd95357cca09c5ed3c4f3d57c11b732ea8befd Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:18 +0000 +Subject: [PATCH 55/77] xen: mark xenstore/console pages as RAM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is required so that later they can be shared with the guest if +Xen is running in shim mode. + +Also prevent them from being used by Xen by marking them as bad pages +in init_boot_pages. 
+ +Signed-off-by: Roger Pau Monné +Signed-off-by: Wei Liu +--- +Changes since v1: + - Remove adding the pages to dom_io, there's no need since they are + already marked as bad pages. + - Use a static global array to store the memory address of these + special pages, so Xen avoids having to call + xen_hypercall_hvm_get_param twice. +--- + xen/arch/x86/e820.c | 4 ++++ + xen/arch/x86/guest/xen.c | 43 +++++++++++++++++++++++++++++++++++++++ + xen/common/page_alloc.c | 15 ++++++++++++++ + xen/drivers/char/xen_pv_console.c | 4 ++++ + xen/include/asm-x86/guest/xen.h | 14 +++++++++++++ + 5 files changed, 80 insertions(+) + +diff --git a/xen/arch/x86/e820.c b/xen/arch/x86/e820.c +index b422a684ee..590ea985ef 100644 +--- a/xen/arch/x86/e820.c ++++ b/xen/arch/x86/e820.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + /* + * opt_mem: Limit maximum address of physical RAM. +@@ -699,6 +700,9 @@ unsigned long __init init_e820(const char *str, struct e820map *raw) + + machine_specific_memory_setup(raw); + ++ if ( xen_guest ) ++ hypervisor_fixup_e820(&e820); ++ + printk("%s RAM map:\n", str); + print_e820_memory_map(e820.map, e820.nr_map); + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index d4968b47aa..27a6c47753 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -32,12 +32,14 @@ + #include + + #include ++#include + + bool __read_mostly xen_guest; + + static __read_mostly uint32_t xen_cpuid_base; + extern char hypercall_page[]; + static struct rangeset *mem; ++static unsigned long __initdata reserved_pages[2]; + + DEFINE_PER_CPU(unsigned int, vcpu_id); + +@@ -279,6 +281,47 @@ int hypervisor_free_unused_page(mfn_t mfn) + return rangeset_remove_range(mem, mfn_x(mfn), mfn_x(mfn)); + } + ++static void __init mark_pfn_as_ram(struct e820map *e820, uint64_t pfn) ++{ ++ if ( !e820_add_range(e820, pfn << PAGE_SHIFT, ++ (pfn << PAGE_SHIFT) + PAGE_SIZE, E820_RAM) ) ++ if ( !e820_change_range_type(e820, pfn << PAGE_SHIFT, ++ (pfn 
<< PAGE_SHIFT) + PAGE_SIZE, ++ E820_RESERVED, E820_RAM) ) ++ panic("Unable to add/change memory type of pfn %#lx to RAM", pfn); ++} ++ ++void __init hypervisor_fixup_e820(struct e820map *e820) ++{ ++ uint64_t pfn = 0; ++ unsigned int i = 0; ++ long rc; ++ ++ ASSERT(xen_guest); ++ ++#define MARK_PARAM_RAM(p) ({ \ ++ rc = xen_hypercall_hvm_get_param(p, &pfn); \ ++ if ( rc ) \ ++ panic("Unable to get " #p); \ ++ mark_pfn_as_ram(e820, pfn); \ ++ ASSERT(i < ARRAY_SIZE(reserved_pages)); \ ++ reserved_pages[i++] = pfn << PAGE_SHIFT; \ ++}) ++ MARK_PARAM_RAM(HVM_PARAM_STORE_PFN); ++ if ( !pv_console ) ++ MARK_PARAM_RAM(HVM_PARAM_CONSOLE_PFN); ++#undef MARK_PARAM_RAM ++} ++ ++const unsigned long *__init hypervisor_reserved_pages(unsigned int *size) ++{ ++ ASSERT(xen_guest); ++ ++ *size = ARRAY_SIZE(reserved_pages); ++ ++ return reserved_pages; ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 5616a82263..49b2a91751 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -143,6 +143,7 @@ + #include + #include + #ifdef CONFIG_X86 ++#include + #include + #include /* for highmem_start only */ + #else +@@ -303,6 +304,20 @@ void __init init_boot_pages(paddr_t ps, paddr_t pe) + badpage++; + } + } ++ ++ if ( xen_guest ) ++ { ++ badpage = hypervisor_reserved_pages(&array_size); ++ if ( badpage ) ++ { ++ for ( i = 0; i < array_size; i++ ) ++ { ++ bootmem_region_zap(*badpage >> PAGE_SHIFT, ++ (*badpage >> PAGE_SHIFT) + 1); ++ badpage++; ++ } ++ } ++ } + #endif + + /* Check new pages against the bad-page list. 
*/ +diff --git a/xen/drivers/char/xen_pv_console.c b/xen/drivers/char/xen_pv_console.c +index f5aca4c69e..d4f0532101 100644 +--- a/xen/drivers/char/xen_pv_console.c ++++ b/xen/drivers/char/xen_pv_console.c +@@ -35,6 +35,8 @@ static evtchn_port_t cons_evtchn; + static serial_rx_fn cons_rx_handler; + static DEFINE_SPINLOCK(tx_lock); + ++bool pv_console; ++ + void __init pv_console_init(void) + { + long r; +@@ -60,6 +62,8 @@ void __init pv_console_init(void) + + printk("Initialised PV console at 0x%p with pfn %#lx and evtchn %#x\n", + cons_ring, raw_pfn, cons_evtchn); ++ pv_console = true; ++ + return; + + error: +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index b3e684f756..62255fda8b 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -29,12 +29,15 @@ + #ifdef CONFIG_XEN_GUEST + + extern bool xen_guest; ++extern bool pv_console; + + void probe_hypervisor(void); + void hypervisor_setup(void); + void hypervisor_ap_setup(void); + int hypervisor_alloc_unused_page(mfn_t *mfn); + int hypervisor_free_unused_page(mfn_t mfn); ++void hypervisor_fixup_e820(struct e820map *e820); ++const unsigned long *hypervisor_reserved_pages(unsigned int *size); + + DECLARE_PER_CPU(unsigned int, vcpu_id); + DECLARE_PER_CPU(struct vcpu_info *, vcpu_info); +@@ -42,6 +45,7 @@ DECLARE_PER_CPU(struct vcpu_info *, vcpu_info); + #else + + #define xen_guest 0 ++#define pv_console 0 + + static inline void probe_hypervisor(void) {}; + static inline void hypervisor_setup(void) +@@ -53,6 +57,16 @@ static inline void hypervisor_ap_setup(void) + ASSERT_UNREACHABLE(); + } + ++static inline void hypervisor_fixup_e820(struct e820map *e820) ++{ ++ ASSERT_UNREACHABLE(); ++} ++static inline const unsigned long *hypervisor_reserved_pages(unsigned int *size) ++{ ++ ASSERT_UNREACHABLE(); ++ return NULL; ++}; ++ + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_GUEST_XEN_H__ */ + +-- +2.14.3 + + +From 1cd703979f73778403d0b0cf5c77c87534c544db 
Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:18 +0000 +Subject: [PATCH 56/77] xen/pvshim: modify Dom0 builder in order to build a + DomU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +According to the PV ABI the initial virtual memory regions should +contain the xenstore and console pages after the start_info. Also set +the correct values in the start_info for DomU operation. + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Modify the position of the __init attribute in dom0_update_physmap. + - Move the addition of sizeof(struct dom0_vga_console_info) to + vstartinfo_end with an existing if branch. + - Add a TODO item for fill_console_start_info in the !CONFIG_VIDEO + case. + - s/replace_va/replace_va_mapping/. + - Remove call to free_domheap_pages in replace_va_mapping. + put_page_and_type should already take care of freeing the page. + - Use PFN_DOWN in SET_AND_MAP_PARAM macro. + - Parenthesize va in SET_AND_MAP_PARAM macro when required. 
+--- + xen/arch/x86/pv/dom0_build.c | 48 +++++++++++++++++++++++------- + xen/arch/x86/pv/shim.c | 63 ++++++++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/dom0_build.h | 4 +++ + xen/include/asm-x86/pv/shim.h | 21 ++++++++++++++ + 4 files changed, 126 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index 852d00a8be..72752b8656 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -31,9 +31,8 @@ + #define L3_PROT (BASE_PROT|_PAGE_DIRTY) + #define L4_PROT (BASE_PROT|_PAGE_DIRTY) + +-static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, +- unsigned long mfn, +- unsigned long vphysmap_s) ++void __init dom0_update_physmap(struct domain *d, unsigned long pfn, ++ unsigned long mfn, unsigned long vphysmap_s) + { + if ( !is_pv_32bit_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; +@@ -316,6 +315,10 @@ int __init dom0_construct_pv(struct domain *d, + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; ++ unsigned long vxenstore_start = 0; ++ unsigned long vxenstore_end = 0; ++ unsigned long vconsole_start = 0; ++ unsigned long vconsole_end = 0; + unsigned long vstack_start; + unsigned long vstack_end; + unsigned long vpt_start; +@@ -441,11 +444,22 @@ int __init dom0_construct_pv(struct domain *d, + if ( parms.p2m_base != UNSET_ADDR ) + vphysmap_end = vphysmap_start; + vstartinfo_start = round_pgup(vphysmap_end); +- vstartinfo_end = (vstartinfo_start + +- sizeof(struct start_info) + +- sizeof(struct dom0_vga_console_info)); ++ vstartinfo_end = vstartinfo_start + sizeof(struct start_info); ++ ++ if ( pv_shim ) ++ { ++ vxenstore_start = round_pgup(vstartinfo_end); ++ vxenstore_end = vxenstore_start + PAGE_SIZE; ++ vconsole_start = vxenstore_end; ++ vconsole_end = vconsole_start + PAGE_SIZE; ++ vpt_start = vconsole_end; ++ } ++ else ++ { ++ vpt_start = round_pgup(vstartinfo_end); ++ vstartinfo_end += 
sizeof(struct dom0_vga_console_info); ++ } + +- vpt_start = round_pgup(vstartinfo_end); + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) + { + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); +@@ -538,6 +552,8 @@ int __init dom0_construct_pv(struct domain *d, + " Init. ramdisk: %p->%p\n" + " Phys-Mach map: %p->%p\n" + " Start info: %p->%p\n" ++ " Xenstore ring: %p->%p\n" ++ " Console ring: %p->%p\n" + " Page tables: %p->%p\n" + " Boot stack: %p->%p\n" + " TOTAL: %p->%p\n", +@@ -545,6 +561,8 @@ int __init dom0_construct_pv(struct domain *d, + _p(vinitrd_start), _p(vinitrd_end), + _p(vphysmap_start), _p(vphysmap_end), + _p(vstartinfo_start), _p(vstartinfo_end), ++ _p(vxenstore_start), _p(vxenstore_end), ++ _p(vconsole_start), _p(vconsole_end), + _p(vpt_start), _p(vpt_end), + _p(vstack_start), _p(vstack_end), + _p(v_start), _p(v_end)); +@@ -742,7 +760,8 @@ int __init dom0_construct_pv(struct domain *d, + + si->shared_info = virt_to_maddr(d->shared_info); + +- si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; ++ if ( !pv_shim ) ++ si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + if ( !vinitrd_start && initrd_len ) + si->flags |= SIF_MOD_START_PFN; + si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK; +@@ -834,15 +853,24 @@ int __init dom0_construct_pv(struct domain *d, + strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)); + + #ifdef CONFIG_VIDEO +- if ( fill_console_start_info((void *)(si + 1)) ) ++ if ( !pv_shim && fill_console_start_info((void *)(si + 1)) ) + { + si->console.dom0.info_off = sizeof(struct start_info); + si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); + } + #endif + ++ /* ++ * TODO: provide an empty stub for fill_console_start_info in the ++ * !CONFIG_VIDEO case so the logic here can be simplified. 
++ */ ++ if ( pv_shim ) ++ pv_shim_setup_dom(d, l4start, v_start, vxenstore_start, vconsole_start, ++ vphysmap_start, si); ++ + if ( is_pv_32bit_domain(d) ) +- xlat_start_info(si, XLAT_start_info_console_dom0); ++ xlat_start_info(si, pv_shim ? XLAT_start_info_console_domU ++ : XLAT_start_info_console_dom0); + + /* Return to idle domain's page tables. */ + mapcache_override_current(NULL); +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 4d037355db..75365b0697 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -18,16 +18,79 @@ + * + * Copyright (c) 2017 Citrix Systems Ltd. + */ ++#include + #include + #include + + #include ++#include ++#include ++#include + + #ifndef CONFIG_PV_SHIM_EXCLUSIVE + bool pv_shim; + boolean_param("pv-shim", pv_shim); + #endif + ++#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER| \ ++ _PAGE_GUEST_KERNEL) ++#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) ++ ++static void __init replace_va_mapping(struct domain *d, l4_pgentry_t *l4start, ++ unsigned long va, unsigned long mfn) ++{ ++ struct page_info *page; ++ l4_pgentry_t *pl4e; ++ l3_pgentry_t *pl3e; ++ l2_pgentry_t *pl2e; ++ l1_pgentry_t *pl1e; ++ ++ pl4e = l4start + l4_table_offset(va); ++ pl3e = l4e_to_l3e(*pl4e); ++ pl3e += l3_table_offset(va); ++ pl2e = l3e_to_l2e(*pl3e); ++ pl2e += l2_table_offset(va); ++ pl1e = l2e_to_l1e(*pl2e); ++ pl1e += l1_table_offset(va); ++ ++ page = mfn_to_page(l1e_get_pfn(*pl1e)); ++ put_page_and_type(page); ++ ++ *pl1e = l1e_from_pfn(mfn, (!is_pv_32bit_domain(d) ? 
L1_PROT ++ : COMPAT_L1_PROT)); ++} ++ ++void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, ++ unsigned long va_start, unsigned long store_va, ++ unsigned long console_va, unsigned long vphysmap, ++ start_info_t *si) ++{ ++ uint64_t param = 0; ++ long rc; ++ ++#define SET_AND_MAP_PARAM(p, si, va) ({ \ ++ rc = xen_hypercall_hvm_get_param(p, &param); \ ++ if ( rc ) \ ++ panic("Unable to get " #p "\n"); \ ++ (si) = param; \ ++ if ( va ) \ ++ { \ ++ share_xen_page_with_guest(mfn_to_page(param), d, XENSHARE_writable); \ ++ replace_va_mapping(d, l4start, va, param); \ ++ dom0_update_physmap(d, PFN_DOWN((va) - va_start), param, vphysmap); \ ++ } \ ++}) ++ SET_AND_MAP_PARAM(HVM_PARAM_STORE_PFN, si->store_mfn, store_va); ++ SET_AND_MAP_PARAM(HVM_PARAM_STORE_EVTCHN, si->store_evtchn, 0); ++ if ( !pv_console ) ++ { ++ SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_PFN, si->console.domU.mfn, ++ console_va); ++ SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_EVTCHN, si->console.domU.evtchn, 0); ++ } ++#undef SET_AND_MAP_PARAM ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/include/asm-x86/dom0_build.h b/xen/include/asm-x86/dom0_build.h +index d83d2b4387..d985406503 100644 +--- a/xen/include/asm-x86/dom0_build.h ++++ b/xen/include/asm-x86/dom0_build.h +@@ -1,6 +1,7 @@ + #ifndef _DOM0_BUILD_H_ + #define _DOM0_BUILD_H_ + ++#include + #include + + #include +@@ -29,6 +30,9 @@ int dom0_construct_pvh(struct domain *d, const module_t *image, + unsigned long dom0_paging_pages(const struct domain *d, + unsigned long nr_pages); + ++void dom0_update_physmap(struct domain *d, unsigned long pfn, ++ unsigned long mfn, unsigned long vphysmap_s); ++ + #endif /* _DOM0_BUILD_H_ */ + + /* +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index 1468cfd498..b0c361cba1 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -29,6 +29,27 @@ extern bool pv_shim; + # define pv_shim 0 + #endif /* CONFIG_PV_SHIM{,_EXCLUSIVE} */ + ++#ifdef 
CONFIG_PV_SHIM ++ ++void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, ++ unsigned long va_start, unsigned long store_va, ++ unsigned long console_va, unsigned long vphysmap, ++ start_info_t *si); ++ ++#else ++ ++static inline void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, ++ unsigned long va_start, ++ unsigned long store_va, ++ unsigned long console_va, ++ unsigned long vphysmap, ++ start_info_t *si) ++{ ++ ASSERT_UNREACHABLE(); ++} ++ ++#endif ++ + #endif /* __X86_PV_SHIM_H__ */ + + /* +-- +2.14.3 + + +From da4518c5595c048a5c030225533e44e021fffaab Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:19 +0000 +Subject: [PATCH 57/77] xen/pvshim: set correct domid value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If domid is not provided by L0 set domid to 1 by default. Note that L0 +not providing the domid can cause trouble if the guest tries to use +its domid instead of DOMID_SELF when performing hypercalls that are +forwarded to the L0 hypervisor. + +Since the domain created is no longer the hardware domain add a hook +to the domain shutdown path in order to forward shutdown operations to +the L0 hypervisor. + +Signed-off-by: Roger Pau Monné +Signed-off-by: Sergey Dyasli +--- +Changes since v1: + - s/get_dom0_domid/get_initial_domain_id/. + - Add a comment regarding why dom0 needs to be global. + - Fix compilation of xen/common/domain.c on ARM. 
+--- + xen/arch/x86/dom0_build.c | 2 +- + xen/arch/x86/guest/xen.c | 5 +++++ + xen/arch/x86/pv/shim.c | 21 +++++++++++++++++++++ + xen/arch/x86/setup.c | 16 +++++++++++----- + xen/common/domain.c | 12 ++++++++++++ + xen/include/asm-x86/guest/xen.h | 6 ++++++ + xen/include/asm-x86/pv/shim.h | 10 ++++++++++ + 7 files changed, 66 insertions(+), 6 deletions(-) + +diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c +index e2bf81b4e7..452298c624 100644 +--- a/xen/arch/x86/dom0_build.c ++++ b/xen/arch/x86/dom0_build.c +@@ -473,7 +473,7 @@ int __init construct_dom0(struct domain *d, const module_t *image, + int rc; + + /* Sanity! */ +- BUG_ON(d->domain_id != 0); ++ BUG_ON(!pv_shim && d->domain_id != 0); + BUG_ON(d->vcpu[0] == NULL); + BUG_ON(d->vcpu[0]->is_initialised); + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 27a6c47753..aff16a0e35 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -322,6 +322,11 @@ const unsigned long *__init hypervisor_reserved_pages(unsigned int *size) + return reserved_pages; + } + ++uint32_t hypervisor_cpuid_base(void) ++{ ++ return xen_cpuid_base; ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 75365b0697..78351c9ee0 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -20,6 +20,7 @@ + */ + #include + #include ++#include + #include + + #include +@@ -27,6 +28,8 @@ + #include + #include + ++#include ++ + #ifndef CONFIG_PV_SHIM_EXCLUSIVE + bool pv_shim; + boolean_param("pv-shim", pv_shim); +@@ -91,6 +94,24 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + #undef SET_AND_MAP_PARAM + } + ++void pv_shim_shutdown(uint8_t reason) ++{ ++ /* XXX: handle suspend */ ++ xen_hypercall_shutdown(reason); ++} ++ ++domid_t get_initial_domain_id(void) ++{ ++ uint32_t eax, ebx, ecx, edx; ++ ++ if ( !pv_shim ) ++ return 0; ++ ++ cpuid(hypervisor_cpuid_base() + 4, &eax, &ebx, &ecx, &edx); ++ 
++ return (eax & XEN_HVM_CPUID_DOMID_PRESENT) ? ecx : 1; ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 2279014f74..7091c38047 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -104,6 +104,12 @@ unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4; + #define SMEP_HVM_ONLY (-1) + static s8 __initdata opt_smep = 1; + ++/* ++ * Initial domain place holder. Needs to be global so it can be created in ++ * __start_xen and unpaused in init_done. ++ */ ++static struct domain *__initdata dom0; ++ + static int __init parse_smep_param(const char *s) + { + if ( !*s ) +@@ -576,11 +582,11 @@ static void noinline init_done(void) + + system_state = SYS_STATE_active; + ++ domain_unpause_by_systemcontroller(dom0); ++ + /* MUST be done prior to removing .init data. */ + unregister_init_virtual_region(); + +- domain_unpause_by_systemcontroller(hardware_domain); +- + /* Zero the .init code and data. */ + for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE ) + clear_page(va); +@@ -659,7 +665,6 @@ void __init noreturn __start_xen(unsigned long mbi_p) + unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; + int i, j, e820_warn = 0, bytes = 0; + bool acpi_boot_table_init_done = false, relocated = false; +- struct domain *dom0; + struct ns16550_defaults ns16550 = { + .data_bits = 8, + .parity = 'n', +@@ -1617,11 +1622,12 @@ void __init noreturn __start_xen(unsigned long mbi_p) + } + + /* Create initial domain 0. */ +- dom0 = domain_create(0, domcr_flags, 0, &config); ++ dom0 = domain_create(get_initial_domain_id(), domcr_flags, 0, &config); + if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) ) + panic("Error creating domain 0"); + +- dom0->is_privileged = 1; ++ if ( !pv_shim ) ++ dom0->is_privileged = 1; + dom0->target = NULL; + + /* Grab the DOM0 command line. 
*/ +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 7484693a87..1ba05fa3a1 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -43,6 +43,10 @@ + #include + #include + ++#ifdef CONFIG_X86 ++#include ++#endif ++ + /* Linux config option: propageted to domain0 */ + /* xen_processor_pmbits: xen control Cx, Px, ... */ + unsigned int xen_processor_pmbits = XEN_PROCESSOR_PM_PX; +@@ -689,6 +693,14 @@ void domain_shutdown(struct domain *d, u8 reason) + { + struct vcpu *v; + ++#ifdef CONFIG_X86 ++ if ( pv_shim ) ++ { ++ pv_shim_shutdown(reason); ++ return; ++ } ++#endif ++ + spin_lock(&d->shutdown_lock); + + if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index 62255fda8b..ac48dcbe44 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -38,6 +38,7 @@ int hypervisor_alloc_unused_page(mfn_t *mfn); + int hypervisor_free_unused_page(mfn_t mfn); + void hypervisor_fixup_e820(struct e820map *e820); + const unsigned long *hypervisor_reserved_pages(unsigned int *size); ++uint32_t hypervisor_cpuid_base(void); + + DECLARE_PER_CPU(unsigned int, vcpu_id); + DECLARE_PER_CPU(struct vcpu_info *, vcpu_info); +@@ -66,6 +67,11 @@ static inline const unsigned long *hypervisor_reserved_pages(unsigned int *size) + ASSERT_UNREACHABLE(); + return NULL; + }; ++static inline uint32_t hypervisor_cpuid_base(void) ++{ ++ ASSERT_UNREACHABLE(); ++ return 0; ++}; + + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_GUEST_XEN_H__ */ +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index b0c361cba1..ff7c050dc6 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -35,6 +35,8 @@ void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + unsigned long va_start, unsigned long store_va, + unsigned long console_va, unsigned long vphysmap, + start_info_t *si); ++void pv_shim_shutdown(uint8_t reason); 
++domid_t get_initial_domain_id(void); + + #else + +@@ -47,6 +49,14 @@ static inline void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + { + ASSERT_UNREACHABLE(); + } ++static inline void pv_shim_shutdown(uint8_t reason) ++{ ++ ASSERT_UNREACHABLE(); ++} ++static inline domid_t get_initial_domain_id(void) ++{ ++ return 0; ++} + + #endif + +-- +2.14.3 + + +From bbad376ab1c1c57ba31059bd2269aa9f213579d6 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:19 +0000 +Subject: [PATCH 58/77] xen/pvshim: forward evtchn ops between L0 Xen and L2 + DomU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Note that the unmask and the virq operations are handled by the shim +itself, and that FIFO event channels are not exposed to the guest. + +Signed-off-by: Roger Pau Monné +Signed-off-by: Anthony Liguori +Signed-off-by: Sergey Dyasli +--- +Changes since v1: + - Use find_first_set_bit instead of ffsl. + - Indent macro one more level. + - Have a single evtchn_close struct in pv_shim_event_channel_op. + - Add blank lines between switch cases. + - Use -EOPNOTSUPP in order to signal lack of FIFO or PIRQ support. + - Switch evtchn_bind_virq parameter to evtchn_port_t and use 0 to signal + allocation needed. + - Switch evtchn helpers return type to int instead of long. + - Re-write event channel hypercall table handler instead of adding + hooks. + - Remove the pv_domain variable and instead use a static variable in + shim code. 
+--- + xen/arch/x86/compat.c | 4 +- + xen/arch/x86/guest/xen.c | 25 +++- + xen/arch/x86/pv/hypercall.c | 17 +++ + xen/arch/x86/pv/shim.c | 263 ++++++++++++++++++++++++++++++++++++++ + xen/common/event_channel.c | 99 ++++++++------ + xen/drivers/char/xen_pv_console.c | 11 +- + xen/include/asm-x86/hypercall.h | 3 + + xen/include/asm-x86/pv/shim.h | 5 + + xen/include/xen/event.h | 15 +++ + xen/include/xen/pv_console.h | 6 + + 10 files changed, 402 insertions(+), 46 deletions(-) + +diff --git a/xen/arch/x86/compat.c b/xen/arch/x86/compat.c +index f417cd5034..9d376a4589 100644 +--- a/xen/arch/x86/compat.c ++++ b/xen/arch/x86/compat.c +@@ -69,8 +69,8 @@ long do_event_channel_op_compat(XEN_GUEST_HANDLE_PARAM(evtchn_op_t) uop) + case EVTCHNOP_bind_ipi: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: +- return do_event_channel_op(op.cmd, +- guest_handle_from_ptr(&uop.p->u, void)); ++ return pv_get_hypercall_handler(__HYPERVISOR_event_channel_op, false) ++ (op.cmd, (unsigned long)&uop.p->u, 0, 0, 0, 0); + + default: + return -ENOSYS; +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index aff16a0e35..57b297ad47 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -18,6 +18,7 @@ + * + * Copyright (c) 2017 Citrix Systems Ltd. 
+ */ ++#include + #include + #include + #include +@@ -193,11 +194,31 @@ static void __init init_memmap(void) + static void xen_evtchn_upcall(struct cpu_user_regs *regs) + { + struct vcpu_info *vcpu_info = this_cpu(vcpu_info); ++ unsigned long pending; + + vcpu_info->evtchn_upcall_pending = 0; +- write_atomic(&vcpu_info->evtchn_pending_sel, 0); ++ pending = xchg(&vcpu_info->evtchn_pending_sel, 0); + +- pv_console_rx(regs); ++ while ( pending ) ++ { ++ unsigned int l1 = find_first_set_bit(pending); ++ unsigned long evtchn = xchg(&XEN_shared_info->evtchn_pending[l1], 0); ++ ++ __clear_bit(l1, &pending); ++ evtchn &= ~XEN_shared_info->evtchn_mask[l1]; ++ while ( evtchn ) ++ { ++ unsigned int port = find_first_set_bit(evtchn); ++ ++ __clear_bit(port, &evtchn); ++ port += l1 * BITS_PER_LONG; ++ ++ if ( pv_console && port == pv_console_evtchn() ) ++ pv_console_rx(regs); ++ else if ( pv_shim ) ++ pv_shim_inject_evtchn(port); ++ } ++ } + + ack_APIC_irq(); + } +diff --git a/xen/arch/x86/pv/hypercall.c b/xen/arch/x86/pv/hypercall.c +index f79f7eef62..3b72d6a44d 100644 +--- a/xen/arch/x86/pv/hypercall.c ++++ b/xen/arch/x86/pv/hypercall.c +@@ -320,6 +320,23 @@ void hypercall_page_initialise_ring1_kernel(void *hypercall_page) + *(u16 *)(p+ 6) = (HYPERCALL_VECTOR << 8) | 0xcd; /* int $xx */ + } + ++void __init pv_hypercall_table_replace(unsigned int hypercall, ++ hypercall_fn_t * native, ++ hypercall_fn_t *compat) ++{ ++#define HANDLER_POINTER(f) \ ++ ((unsigned long *)__va(__pa(&pv_hypercall_table[hypercall].f))) ++ write_atomic(HANDLER_POINTER(native), (unsigned long)native); ++ write_atomic(HANDLER_POINTER(compat), (unsigned long)compat); ++#undef HANDLER_POINTER ++} ++ ++hypercall_fn_t *pv_get_hypercall_handler(unsigned int hypercall, bool compat) ++{ ++ return compat ? 
pv_hypercall_table[hypercall].compat ++ : pv_hypercall_table[hypercall].native; ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 78351c9ee0..36f3a366d3 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -18,6 +18,8 @@ + * + * Copyright (c) 2017 Citrix Systems Ltd. + */ ++#include ++#include + #include + #include + #include +@@ -35,6 +37,10 @@ bool pv_shim; + boolean_param("pv-shim", pv_shim); + #endif + ++static struct domain *guest; ++ ++static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); ++ + #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER| \ + _PAGE_GUEST_KERNEL) + #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) +@@ -63,6 +69,27 @@ static void __init replace_va_mapping(struct domain *d, l4_pgentry_t *l4start, + : COMPAT_L1_PROT)); + } + ++static void evtchn_reserve(struct domain *d, unsigned int port) ++{ ++ ASSERT(port_is_valid(d, port)); ++ evtchn_from_port(d, port)->state = ECS_RESERVED; ++ BUG_ON(xen_hypercall_evtchn_unmask(port)); ++} ++ ++static bool evtchn_handled(struct domain *d, unsigned int port) ++{ ++ ASSERT(port_is_valid(d, port)); ++ /* The shim manages VIRQs, the rest is forwarded to L0. 
*/ ++ return evtchn_from_port(d, port)->state == ECS_VIRQ; ++} ++ ++static void evtchn_assign_vcpu(struct domain *d, unsigned int port, ++ unsigned int vcpu) ++{ ++ ASSERT(port_is_valid(d, port)); ++ evtchn_from_port(d, port)->notify_vcpu_id = vcpu; ++} ++ + void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + unsigned long va_start, unsigned long store_va, + unsigned long console_va, unsigned long vphysmap, +@@ -82,6 +109,11 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + replace_va_mapping(d, l4start, va, param); \ + dom0_update_physmap(d, PFN_DOWN((va) - va_start), param, vphysmap); \ + } \ ++ else \ ++ { \ ++ BUG_ON(evtchn_allocate_port(d, param)); \ ++ evtchn_reserve(d, param); \ ++ } \ + }) + SET_AND_MAP_PARAM(HVM_PARAM_STORE_PFN, si->store_mfn, store_va); + SET_AND_MAP_PARAM(HVM_PARAM_STORE_EVTCHN, si->store_evtchn, 0); +@@ -92,6 +124,10 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_EVTCHN, si->console.domU.evtchn, 0); + } + #undef SET_AND_MAP_PARAM ++ pv_hypercall_table_replace(__HYPERVISOR_event_channel_op, ++ (hypercall_fn_t *)pv_shim_event_channel_op, ++ (hypercall_fn_t *)pv_shim_event_channel_op); ++ guest = d; + } + + void pv_shim_shutdown(uint8_t reason) +@@ -100,6 +136,233 @@ void pv_shim_shutdown(uint8_t reason) + xen_hypercall_shutdown(reason); + } + ++static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) ++{ ++ struct domain *d = current->domain; ++ struct evtchn_close close; ++ long rc; ++ ++ switch ( cmd ) ++ { ++#define EVTCHN_FORWARD(cmd, port_field) \ ++ case EVTCHNOP_##cmd: { \ ++ struct evtchn_##cmd op; \ ++ \ ++ if ( copy_from_guest(&op, arg, 1) != 0 ) \ ++ return -EFAULT; \ ++ \ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_##cmd, &op); \ ++ if ( rc ) \ ++ break; \ ++ \ ++ spin_lock(&d->event_lock); \ ++ rc = evtchn_allocate_port(d, op.port_field); \ ++ if ( rc ) \ ++ { \ ++ close.port = 
op.port_field; \ ++ BUG_ON(xen_hypercall_event_channel_op(EVTCHNOP_close, &close)); \ ++ } \ ++ else \ ++ evtchn_reserve(d, op.port_field); \ ++ spin_unlock(&d->event_lock); \ ++ \ ++ if ( !rc && __copy_to_guest(arg, &op, 1) ) \ ++ rc = -EFAULT; \ ++ \ ++ break; \ ++ } ++ ++ EVTCHN_FORWARD(alloc_unbound, port) ++ EVTCHN_FORWARD(bind_interdomain, local_port) ++#undef EVTCHN_FORWARD ++ ++ case EVTCHNOP_bind_virq: { ++ struct evtchn_bind_virq virq; ++ struct evtchn_alloc_unbound alloc = { ++ .dom = DOMID_SELF, ++ .remote_dom = DOMID_SELF, ++ }; ++ ++ if ( copy_from_guest(&virq, arg, 1) != 0 ) ++ return -EFAULT; ++ /* ++ * The event channel space is actually controlled by L0 Xen, so ++ * allocate a port from L0 and then force the VIRQ to be bound to that ++ * specific port. ++ * ++ * This is only required for VIRQ because the rest of the event channel ++ * operations are handled directly by L0. ++ */ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_alloc_unbound, &alloc); ++ if ( rc ) ++ break; ++ ++ /* Force L1 to use the event channel port allocated on L0. */ ++ rc = evtchn_bind_virq(&virq, alloc.port); ++ if ( rc ) ++ { ++ close.port = alloc.port; ++ BUG_ON(xen_hypercall_event_channel_op(EVTCHNOP_close, &close)); ++ } ++ ++ if ( !rc && __copy_to_guest(arg, &virq, 1) ) ++ rc = -EFAULT; ++ ++ break; ++ } ++ ++ case EVTCHNOP_status: { ++ struct evtchn_status status; ++ ++ if ( copy_from_guest(&status, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ /* ++ * NB: if the event channel is not handled by the shim, just forward ++ * the status request to L0, even if the port is not valid. 
++ */ ++ if ( port_is_valid(d, status.port) && evtchn_handled(d, status.port) ) ++ rc = evtchn_status(&status); ++ else ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_status, &status); ++ ++ break; ++ } ++ ++ case EVTCHNOP_bind_vcpu: { ++ struct evtchn_bind_vcpu vcpu; ++ ++ if ( copy_from_guest(&vcpu, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ if ( !port_is_valid(d, vcpu.port) ) ++ return -EINVAL; ++ ++ if ( evtchn_handled(d, vcpu.port) ) ++ rc = evtchn_bind_vcpu(vcpu.port, vcpu.vcpu); ++ else ++ { ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_bind_vcpu, &vcpu); ++ if ( !rc ) ++ evtchn_assign_vcpu(d, vcpu.port, vcpu.vcpu); ++ } ++ ++ break; ++ } ++ ++ case EVTCHNOP_close: { ++ if ( copy_from_guest(&close, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ if ( !port_is_valid(d, close.port) ) ++ return -EINVAL; ++ ++ set_bit(close.port, XEN_shared_info->evtchn_mask); ++ ++ if ( evtchn_handled(d, close.port) ) ++ { ++ rc = evtchn_close(d, close.port, true); ++ if ( rc ) ++ break; ++ } ++ else ++ evtchn_free(d, evtchn_from_port(d, close.port)); ++ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_close, &close); ++ if ( rc ) ++ /* ++ * If the port cannot be closed on the L0 mark it as reserved ++ * in the shim to avoid re-using it. 
++ */ ++ evtchn_reserve(d, close.port); ++ ++ break; ++ } ++ ++ case EVTCHNOP_bind_ipi: { ++ struct evtchn_bind_ipi ipi; ++ ++ if ( copy_from_guest(&ipi, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_bind_ipi, &ipi); ++ if ( rc ) ++ break; ++ ++ spin_lock(&d->event_lock); ++ rc = evtchn_allocate_port(d, ipi.port); ++ if ( rc ) ++ { ++ spin_unlock(&d->event_lock); ++ ++ close.port = ipi.port; ++ BUG_ON(xen_hypercall_event_channel_op(EVTCHNOP_close, &close)); ++ break; ++ } ++ ++ evtchn_assign_vcpu(d, ipi.port, ipi.vcpu); ++ evtchn_reserve(d, ipi.port); ++ spin_unlock(&d->event_lock); ++ ++ if ( __copy_to_guest(arg, &ipi, 1) ) ++ rc = -EFAULT; ++ ++ break; ++ } ++ ++ case EVTCHNOP_unmask: { ++ struct evtchn_unmask unmask; ++ ++ if ( copy_from_guest(&unmask, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ /* Unmask is handled in L1 */ ++ rc = evtchn_unmask(unmask.port); ++ ++ break; ++ } ++ ++ case EVTCHNOP_send: { ++ struct evtchn_send send; ++ ++ if ( copy_from_guest(&send, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_send, &send); ++ ++ break; ++ } ++ ++ case EVTCHNOP_reset: { ++ struct evtchn_reset reset; ++ ++ if ( copy_from_guest(&reset, arg, 1) != 0 ) ++ return -EFAULT; ++ ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_reset, &reset); ++ ++ break; ++ } ++ ++ default: ++ /* No FIFO or PIRQ support for now */ ++ rc = -EOPNOTSUPP; ++ break; ++ } ++ ++ return rc; ++} ++ ++void pv_shim_inject_evtchn(unsigned int port) ++{ ++ if ( port_is_valid(guest, port) ) ++ { ++ struct evtchn *chn = evtchn_from_port(guest, port); ++ ++ evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn); ++ } ++} ++ + domid_t get_initial_domain_id(void) + { + uint32_t eax, ebx, ecx, edx; +diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c +index c69f9db6db..be834c5c78 100644 +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -156,46 +156,62 @@ static void free_evtchn_bucket(struct 
domain *d, struct evtchn *bucket) + xfree(bucket); + } + ++int evtchn_allocate_port(struct domain *d, evtchn_port_t port) ++{ ++ if ( port > d->max_evtchn_port || port >= d->max_evtchns ) ++ return -ENOSPC; ++ ++ if ( port_is_valid(d, port) ) ++ { ++ if ( evtchn_from_port(d, port)->state != ECS_FREE || ++ evtchn_port_is_busy(d, port) ) ++ return -EBUSY; ++ } ++ else ++ { ++ struct evtchn *chn; ++ struct evtchn **grp; ++ ++ if ( !group_from_port(d, port) ) ++ { ++ grp = xzalloc_array(struct evtchn *, BUCKETS_PER_GROUP); ++ if ( !grp ) ++ return -ENOMEM; ++ group_from_port(d, port) = grp; ++ } ++ ++ chn = alloc_evtchn_bucket(d, port); ++ if ( !chn ) ++ return -ENOMEM; ++ bucket_from_port(d, port) = chn; ++ ++ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); ++ } ++ ++ return 0; ++} ++ + static int get_free_port(struct domain *d) + { +- struct evtchn *chn; +- struct evtchn **grp; + int port; + + if ( d->is_dying ) + return -EINVAL; + +- for ( port = 0; port_is_valid(d, port); port++ ) ++ for ( port = 0; port <= d->max_evtchn_port; port++ ) + { +- if ( port > d->max_evtchn_port ) +- return -ENOSPC; +- if ( evtchn_from_port(d, port)->state == ECS_FREE +- && !evtchn_port_is_busy(d, port) ) +- return port; +- } ++ int rc = evtchn_allocate_port(d, port); + +- if ( port == d->max_evtchns || port > d->max_evtchn_port ) +- return -ENOSPC; ++ if ( rc == -EBUSY ) ++ continue; + +- if ( !group_from_port(d, port) ) +- { +- grp = xzalloc_array(struct evtchn *, BUCKETS_PER_GROUP); +- if ( !grp ) +- return -ENOMEM; +- group_from_port(d, port) = grp; ++ return rc ? rc : port; + } + +- chn = alloc_evtchn_bucket(d, port); +- if ( !chn ) +- return -ENOMEM; +- bucket_from_port(d, port) = chn; +- +- write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); +- +- return port; ++ return -ENOSPC; + } + +-static void free_evtchn(struct domain *d, struct evtchn *chn) ++void evtchn_free(struct domain *d, struct evtchn *chn) + { + /* Clear pending event to avoid
unexpected behavior on re-bind. */ + evtchn_port_clear_pending(d, chn); +@@ -345,13 +361,13 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) + } + + +-static long evtchn_bind_virq(evtchn_bind_virq_t *bind) ++int evtchn_bind_virq(evtchn_bind_virq_t *bind, evtchn_port_t port) + { + struct evtchn *chn; + struct vcpu *v; + struct domain *d = current->domain; +- int port, virq = bind->virq, vcpu = bind->vcpu; +- long rc = 0; ++ int virq = bind->virq, vcpu = bind->vcpu; ++ int rc = 0; + + if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) + return -EINVAL; +@@ -368,8 +384,19 @@ static long evtchn_bind_virq(evtchn_bind_virq_t *bind) + if ( v->virq_to_evtchn[virq] != 0 ) + ERROR_EXIT(-EEXIST); + +- if ( (port = get_free_port(d)) < 0 ) +- ERROR_EXIT(port); ++ if ( port != 0 ) ++ { ++ if ( (rc = evtchn_allocate_port(d, port)) != 0 ) ++ ERROR_EXIT(rc); ++ } ++ else ++ { ++ int alloc_port = get_free_port(d); ++ ++ if ( alloc_port < 0 ) ++ ERROR_EXIT(alloc_port); ++ port = alloc_port; ++ } + + chn = evtchn_from_port(d, port); + +@@ -511,7 +538,7 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind) + } + + +-static long evtchn_close(struct domain *d1, int port1, bool_t guest) ++int evtchn_close(struct domain *d1, int port1, bool guest) + { + struct domain *d2 = NULL; + struct vcpu *v; +@@ -619,7 +646,7 @@ static long evtchn_close(struct domain *d1, int port1, bool_t guest) + + double_evtchn_lock(chn1, chn2); + +- free_evtchn(d1, chn1); ++ evtchn_free(d1, chn1); + + chn2->state = ECS_UNBOUND; + chn2->u.unbound.remote_domid = d1->domain_id; +@@ -633,7 +660,7 @@ static long evtchn_close(struct domain *d1, int port1, bool_t guest) + } + + spin_lock(&chn1->lock); +- free_evtchn(d1, chn1); ++ evtchn_free(d1, chn1); + spin_unlock(&chn1->lock); + + out: +@@ -839,7 +866,7 @@ static void clear_global_virq_handlers(struct domain *d) + } + } + +-static long evtchn_status(evtchn_status_t *status) ++int evtchn_status(evtchn_status_t *status) + { + struct 
domain *d; + domid_t dom = status->dom; +@@ -1056,7 +1083,7 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + struct evtchn_bind_virq bind_virq; + if ( copy_from_guest(&bind_virq, arg, 1) != 0 ) + return -EFAULT; +- rc = evtchn_bind_virq(&bind_virq); ++ rc = evtchn_bind_virq(&bind_virq, 0); + if ( !rc && __copy_to_guest(arg, &bind_virq, 1) ) + rc = -EFAULT; /* Cleaning up here would be a mess! */ + break; +diff --git a/xen/drivers/char/xen_pv_console.c b/xen/drivers/char/xen_pv_console.c +index d4f0532101..948343303e 100644 +--- a/xen/drivers/char/xen_pv_console.c ++++ b/xen/drivers/char/xen_pv_console.c +@@ -88,6 +88,11 @@ static void notify_daemon(void) + xen_hypercall_evtchn_send(cons_evtchn); + } + ++evtchn_port_t pv_console_evtchn(void) ++{ ++ return cons_evtchn; ++} ++ + size_t pv_console_rx(struct cpu_user_regs *regs) + { + char c; +@@ -97,10 +102,6 @@ size_t pv_console_rx(struct cpu_user_regs *regs) + if ( !cons_ring ) + return 0; + +- /* TODO: move this somewhere */ +- if ( !test_bit(cons_evtchn, XEN_shared_info->evtchn_pending) ) +- return 0; +- + prod = ACCESS_ONCE(cons_ring->in_prod); + cons = cons_ring->in_cons; + +@@ -125,8 +126,6 @@ size_t pv_console_rx(struct cpu_user_regs *regs) + ACCESS_ONCE(cons_ring->in_cons) = cons; + notify_daemon(); + +- clear_bit(cons_evtchn, XEN_shared_info->evtchn_pending); +- + return recv; + } + +diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h +index 3eb4a8db89..b9f3ecf9a3 100644 +--- a/xen/include/asm-x86/hypercall.h ++++ b/xen/include/asm-x86/hypercall.h +@@ -28,6 +28,9 @@ extern const hypercall_args_t hypercall_args_table[NR_hypercalls]; + void pv_hypercall(struct cpu_user_regs *regs); + void hypercall_page_initialise_ring3_kernel(void *hypercall_page); + void hypercall_page_initialise_ring1_kernel(void *hypercall_page); ++void pv_hypercall_table_replace(unsigned int hypercall, hypercall_fn_t * native, ++ hypercall_fn_t *compat); ++hypercall_fn_t 
*pv_get_hypercall_handler(unsigned int hypercall, bool compat); + + /* + * Both do_mmuext_op() and do_mmu_update(): +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index ff7c050dc6..ab656fd854 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -36,6 +36,7 @@ void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + unsigned long console_va, unsigned long vphysmap, + start_info_t *si); + void pv_shim_shutdown(uint8_t reason); ++void pv_shim_inject_evtchn(unsigned int port); + domid_t get_initial_domain_id(void); + + #else +@@ -53,6 +54,10 @@ static inline void pv_shim_shutdown(uint8_t reason) + { + ASSERT_UNREACHABLE(); + } ++static inline void pv_shim_inject_evtchn(unsigned int port) ++{ ++ ASSERT_UNREACHABLE(); ++} + static inline domid_t get_initial_domain_id(void) + { + return 0; +diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h +index 87915ead69..ebb879e88d 100644 +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -48,6 +48,21 @@ int evtchn_send(struct domain *d, unsigned int lport); + /* Bind a local event-channel port to the specified VCPU. */ + long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id); + ++/* Bind a VIRQ. */ ++int evtchn_bind_virq(evtchn_bind_virq_t *bind, evtchn_port_t port); ++ ++/* Get the status of an event channel port. */ ++int evtchn_status(evtchn_status_t *status); ++ ++/* Close an event channel. */ ++int evtchn_close(struct domain *d1, int port1, bool guest); ++ ++/* Free an event channel. */ ++void evtchn_free(struct domain *d, struct evtchn *chn); ++ ++/* Allocate a specific event channel port. */ ++int evtchn_allocate_port(struct domain *d, unsigned int port); ++ + /* Unmask a local event-channel port. 
*/ + int evtchn_unmask(unsigned int port); + +diff --git a/xen/include/xen/pv_console.h b/xen/include/xen/pv_console.h +index e578b56620..cb92539666 100644 +--- a/xen/include/xen/pv_console.h ++++ b/xen/include/xen/pv_console.h +@@ -10,6 +10,7 @@ void pv_console_set_rx_handler(serial_rx_fn fn); + void pv_console_init_postirq(void); + void pv_console_puts(const char *buf); + size_t pv_console_rx(struct cpu_user_regs *regs); ++evtchn_port_t pv_console_evtchn(void); + + #else + +@@ -18,6 +19,11 @@ static inline void pv_console_set_rx_handler(serial_rx_fn fn) { } + static inline void pv_console_init_postirq(void) { } + static inline void pv_console_puts(const char *buf) { } + static inline size_t pv_console_rx(struct cpu_user_regs *regs) { return 0; } ++static inline evtchn_port_t pv_console_evtchn(void) ++{ ++ ASSERT_UNREACHABLE(); ++ return 0; ++} + + #endif /* !CONFIG_XEN_GUEST */ + #endif /* __XEN_PV_CONSOLE_H__ */ +-- +2.14.3 + + +From 7f5eb7d04ef2616051b82437d3c9595208a7dec1 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:19 +0000 +Subject: [PATCH 59/77] xen/pvshim: add grant table operations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Roger Pau Monné +Signed-off-by: Anthony Liguori +Signed-off-by: Andrew Cooper +--- +Changes since v1: + - Use __ of copy_to_guest. + - Return EOPNOTSUPP for not implemented grant table hypercalls. + - Forward user provided buffer in GNTTABOP_query_size. + - Rewrite grant table hypercall handler.
+--- + xen/arch/x86/pv/shim.c | 164 ++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/guest/hypercall.h | 6 ++ + 2 files changed, 170 insertions(+) + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 36f3a366d3..eb8b146785 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -32,6 +33,8 @@ + + #include + ++#include ++ + #ifndef CONFIG_PV_SHIM_EXCLUSIVE + bool pv_shim; + boolean_param("pv-shim", pv_shim); +@@ -39,7 +42,14 @@ boolean_param("pv-shim", pv_shim); + + static struct domain *guest; + ++static unsigned int nr_grant_list; ++static unsigned long *grant_frames; ++static DEFINE_SPINLOCK(grant_lock); ++ + static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); ++static long pv_shim_grant_table_op(unsigned int cmd, ++ XEN_GUEST_HANDLE_PARAM(void) uop, ++ unsigned int count); + + #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER| \ + _PAGE_GUEST_KERNEL) +@@ -127,6 +137,9 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + pv_hypercall_table_replace(__HYPERVISOR_event_channel_op, + (hypercall_fn_t *)pv_shim_event_channel_op, + (hypercall_fn_t *)pv_shim_event_channel_op); ++ pv_hypercall_table_replace(__HYPERVISOR_grant_table_op, ++ (hypercall_fn_t *)pv_shim_grant_table_op, ++ (hypercall_fn_t *)pv_shim_grant_table_op); + guest = d; + } + +@@ -363,6 +376,157 @@ void pv_shim_inject_evtchn(unsigned int port) + } + } + ++static long pv_shim_grant_table_op(unsigned int cmd, ++ XEN_GUEST_HANDLE_PARAM(void) uop, ++ unsigned int count) ++{ ++ struct domain *d = current->domain; ++ long rc = 0; ++ ++ if ( count != 1 ) ++ return -EINVAL; ++ ++ switch ( cmd ) ++ { ++ case GNTTABOP_setup_table: ++ { ++ bool compat = is_pv_32bit_domain(d); ++ struct gnttab_setup_table nat; ++ struct compat_gnttab_setup_table cmp; ++ unsigned int i; ++ ++ if ( unlikely(compat ? 
copy_from_guest(&cmp, uop, 1) ++ : copy_from_guest(&nat, uop, 1)) || ++ unlikely(compat ? !compat_handle_okay(cmp.frame_list, ++ cmp.nr_frames) ++ : !guest_handle_okay(nat.frame_list, ++ nat.nr_frames)) ) ++ { ++ rc = -EFAULT; ++ break; ++ } ++ if ( compat ) ++#define XLAT_gnttab_setup_table_HNDL_frame_list(d, s) ++ XLAT_gnttab_setup_table(&nat, &cmp); ++#undef XLAT_gnttab_setup_table_HNDL_frame_list ++ ++ nat.status = GNTST_okay; ++ ++ spin_lock(&grant_lock); ++ if ( !nr_grant_list ) ++ { ++ struct gnttab_query_size query_size = { ++ .dom = DOMID_SELF, ++ }; ++ ++ rc = xen_hypercall_grant_table_op(GNTTABOP_query_size, ++ &query_size, 1); ++ if ( rc ) ++ { ++ spin_unlock(&grant_lock); ++ break; ++ } ++ ++ ASSERT(!grant_frames); ++ grant_frames = xzalloc_array(unsigned long, ++ query_size.max_nr_frames); ++ if ( !grant_frames ) ++ { ++ spin_unlock(&grant_lock); ++ rc = -ENOMEM; ++ break; ++ } ++ ++ nr_grant_list = query_size.max_nr_frames; ++ } ++ ++ if ( nat.nr_frames > nr_grant_list ) ++ { ++ spin_unlock(&grant_lock); ++ rc = -EINVAL; ++ break; ++ } ++ ++ for ( i = 0; i < nat.nr_frames; i++ ) ++ { ++ if ( !grant_frames[i] ) ++ { ++ struct xen_add_to_physmap xatp = { ++ .domid = DOMID_SELF, ++ .idx = i, ++ .space = XENMAPSPACE_grant_table, ++ }; ++ mfn_t mfn; ++ ++ rc = hypervisor_alloc_unused_page(&mfn); ++ if ( rc ) ++ { ++ gprintk(XENLOG_ERR, ++ "unable to get memory for grant table\n"); ++ break; ++ } ++ ++ xatp.gpfn = mfn_x(mfn); ++ rc = xen_hypercall_memory_op(XENMEM_add_to_physmap, &xatp); ++ if ( rc ) ++ { ++ hypervisor_free_unused_page(mfn); ++ break; ++ } ++ ++ BUG_ON(iomem_permit_access(d, mfn_x(mfn), mfn_x(mfn))); ++ grant_frames[i] = mfn_x(mfn); ++ } ++ ++ ASSERT(grant_frames[i]); ++ if ( compat ) ++ { ++ compat_pfn_t pfn = grant_frames[i]; ++ ++ if ( __copy_to_compat_offset(cmp.frame_list, i, &pfn, 1) ) ++ { ++ nat.status = GNTST_bad_virt_addr; ++ rc = -EFAULT; ++ break; ++ } ++ } ++ else if ( __copy_to_guest_offset(nat.frame_list, i, ++ 
&grant_frames[i], 1) ) ++ { ++ nat.status = GNTST_bad_virt_addr; ++ rc = -EFAULT; ++ break; ++ } ++ } ++ spin_unlock(&grant_lock); ++ ++ if ( compat ) ++#define XLAT_gnttab_setup_table_HNDL_frame_list(d, s) ++ XLAT_gnttab_setup_table(&cmp, &nat); ++#undef XLAT_gnttab_setup_table_HNDL_frame_list ++ ++ if ( unlikely(compat ? __copy_to_guest(uop, &cmp, 1) ++ : __copy_to_guest(uop, &nat, 1)) ) ++ { ++ rc = -EFAULT; ++ break; ++ } ++ ++ break; ++ } ++ ++ case GNTTABOP_query_size: ++ rc = xen_hypercall_grant_table_op(GNTTABOP_query_size, uop.p, count); ++ break; ++ ++ default: ++ rc = -EOPNOTSUPP; ++ break; ++ } ++ ++ return rc; ++} ++ + domid_t get_initial_domain_id(void) + { + uint32_t eax, ebx, ecx, edx; +diff --git a/xen/include/asm-x86/guest/hypercall.h b/xen/include/asm-x86/guest/hypercall.h +index 81a955d479..e9e626b474 100644 +--- a/xen/include/asm-x86/guest/hypercall.h ++++ b/xen/include/asm-x86/guest/hypercall.h +@@ -110,6 +110,12 @@ static inline long xen_hypercall_event_channel_op(unsigned int cmd, void *arg) + return _hypercall64_2(long, __HYPERVISOR_event_channel_op, cmd, arg); + } + ++static inline long xen_hypercall_grant_table_op(unsigned int cmd, void *arg, ++ unsigned int count) ++{ ++ return _hypercall64_3(long, __HYPERVISOR_grant_table_op, cmd, arg, count); ++} ++ + static inline long xen_hypercall_hvm_op(unsigned int op, void *arg) + { + return _hypercall64_2(long, __HYPERVISOR_hvm_op, op, arg); +-- +2.14.3 + + +From cc7d96b98cf02540edf6f387286100a50d6f3d04 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Thu, 11 Jan 2018 11:45:23 +0000 +Subject: [PATCH 60/77] x86/pv-shim: shadow PV console's page for L2 DomU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Sergey Dyasli +Signed-off-by: Wei Liu +[remove notify_guest helper and directly use pv_shim_inject_evtchn] +Signed-off-by: Roger Pau Monné +Signed-off-by: Wei Liu +--- +Changes since v1: + - Use pv_shim_inject_evtchn. 
+--- + xen/arch/x86/pv/shim.c | 31 ++++++++-- + xen/drivers/char/Makefile | 1 + + xen/drivers/char/console.c | 6 ++ + xen/drivers/char/consoled.c | 148 ++++++++++++++++++++++++++++++++++++++++++++ + xen/include/xen/consoled.h | 27 ++++++++ + 5 files changed, 209 insertions(+), 4 deletions(-) + create mode 100644 xen/drivers/char/consoled.c + create mode 100644 xen/include/xen/consoled.h + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index eb8b146785..986f9da58a 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -25,6 +25,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -127,13 +129,28 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + }) + SET_AND_MAP_PARAM(HVM_PARAM_STORE_PFN, si->store_mfn, store_va); + SET_AND_MAP_PARAM(HVM_PARAM_STORE_EVTCHN, si->store_evtchn, 0); ++ SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_EVTCHN, si->console.domU.evtchn, 0); + if ( !pv_console ) +- { + SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_PFN, si->console.domU.mfn, + console_va); +- SET_AND_MAP_PARAM(HVM_PARAM_CONSOLE_EVTCHN, si->console.domU.evtchn, 0); +- } + #undef SET_AND_MAP_PARAM ++ else ++ { ++ /* Allocate a new page for DomU's PV console */ ++ void *page = alloc_xenheap_pages(0, MEMF_bits(32)); ++ uint64_t console_mfn; ++ ++ ASSERT(page); ++ clear_page(page); ++ console_mfn = virt_to_mfn(page); ++ si->console.domU.mfn = console_mfn; ++ share_xen_page_with_guest(mfn_to_page(console_mfn), d, ++ XENSHARE_writable); ++ replace_va_mapping(d, l4start, console_va, console_mfn); ++ dom0_update_physmap(d, (console_va - va_start) >> PAGE_SHIFT, ++ console_mfn, vphysmap); ++ consoled_set_ring_addr(page); ++ } + pv_hypercall_table_replace(__HYPERVISOR_event_channel_op, + (hypercall_fn_t *)pv_shim_event_channel_op, + (hypercall_fn_t *)pv_shim_event_channel_op); +@@ -341,7 +358,13 @@ static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + if ( copy_from_guest(&send, arg, 1) != 0 
) + return -EFAULT; + +- rc = xen_hypercall_event_channel_op(EVTCHNOP_send, &send); ++ if ( pv_console && send.port == pv_console_evtchn() ) ++ { ++ consoled_guest_rx(); ++ rc = 0; ++ } ++ else ++ rc = xen_hypercall_event_channel_op(EVTCHNOP_send, &send); + + break; + } +diff --git a/xen/drivers/char/Makefile b/xen/drivers/char/Makefile +index 9d48d0f2dc..0d48b16e8d 100644 +--- a/xen/drivers/char/Makefile ++++ b/xen/drivers/char/Makefile +@@ -9,3 +9,4 @@ obj-$(CONFIG_HAS_EHCI) += ehci-dbgp.o + obj-$(CONFIG_ARM) += arm-uart.o + obj-y += serial.o + obj-$(CONFIG_XEN_GUEST) += xen_pv_console.o ++obj-$(CONFIG_PV_SHIM) += consoled.o +diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c +index 8acd358395..18f5b7f7aa 100644 +--- a/xen/drivers/char/console.c ++++ b/xen/drivers/char/console.c +@@ -32,6 +32,7 @@ + #include + + #ifdef CONFIG_X86 ++#include + #include + #include + #endif +@@ -415,6 +416,11 @@ static void __serial_rx(char c, struct cpu_user_regs *regs) + serial_rx_ring[SERIAL_RX_MASK(serial_rx_prod++)] = c; + /* Always notify the guest: prevents receive path from getting stuck. */ + send_global_virq(VIRQ_CONSOLE); ++ ++#ifdef CONFIG_X86 ++ if ( pv_shim && pv_console ) ++ consoled_guest_tx(c); ++#endif + } + + static void serial_rx(char c, struct cpu_user_regs *regs) +diff --git a/xen/drivers/char/consoled.c b/xen/drivers/char/consoled.c +new file mode 100644 +index 0000000000..552abf5766 +--- /dev/null ++++ b/xen/drivers/char/consoled.c +@@ -0,0 +1,148 @@ ++/****************************************************************************** ++ * drivers/char/consoled.c ++ * ++ * A backend driver for Xen's PV console. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; If not, see . ++ * ++ * Copyright (c) 2017 Citrix Systems Ltd. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++static struct xencons_interface *cons_ring; ++static DEFINE_SPINLOCK(rx_lock); ++ ++void consoled_set_ring_addr(struct xencons_interface *ring) ++{ ++ cons_ring = ring; ++} ++ ++struct xencons_interface *consoled_get_ring_addr(void) ++{ ++ return cons_ring; ++} ++ ++#define BUF_SZ 255 ++static char buf[BUF_SZ + 1]; ++ ++/* Receives characters from a domain's PV console */ ++size_t consoled_guest_rx(void) ++{ ++ size_t recv = 0, idx = 0; ++ XENCONS_RING_IDX cons, prod; ++ ++ if ( !cons_ring ) ++ return 0; ++ ++ spin_lock(&rx_lock); ++ ++ cons = cons_ring->out_cons; ++ prod = ACCESS_ONCE(cons_ring->out_prod); ++ ++ /* ++ * Latch pointers before accessing the ring. Included compiler barrier also ++ * ensures that pointers are really read only once into local variables. ++ */ ++ smp_rmb(); ++ ++ ASSERT((prod - cons) <= sizeof(cons_ring->out)); ++ ++ /* Is the ring empty? 
*/ ++ if ( cons == prod ) ++ goto out; ++ ++ while ( cons != prod ) ++ { ++ char c = cons_ring->out[MASK_XENCONS_IDX(cons++, cons_ring->out)]; ++ ++ buf[idx++] = c; ++ recv++; ++ ++ if ( idx >= BUF_SZ ) ++ { ++ pv_console_puts(buf); ++ idx = 0; ++ } ++ } ++ ++ if ( idx ) ++ { ++ buf[idx] = '\0'; ++ pv_console_puts(buf); ++ } ++ ++ /* No need for a mem barrier because every character was already consumed */ ++ barrier(); ++ ACCESS_ONCE(cons_ring->out_cons) = cons; ++ pv_shim_inject_evtchn(pv_console_evtchn()); ++ ++ out: ++ spin_unlock(&rx_lock); ++ ++ return recv; ++} ++ ++/* Sends a character into a domain's PV console */ ++size_t consoled_guest_tx(char c) ++{ ++ size_t sent = 0; ++ XENCONS_RING_IDX cons, prod; ++ ++ if ( !cons_ring ) ++ return 0; ++ ++ cons = ACCESS_ONCE(cons_ring->in_cons); ++ prod = cons_ring->in_prod; ++ ++ /* ++ * Latch pointers before accessing the ring. Included compiler barrier also ++ * ensures that pointers are really read only once into local variables. ++ */ ++ smp_rmb(); ++ ++ ASSERT((prod - cons) <= sizeof(cons_ring->in)); ++ ++ /* Is the ring out of space? */ ++ if ( sizeof(cons_ring->in) - (prod - cons) == 0 ) ++ goto notify; ++ ++ cons_ring->in[MASK_XENCONS_IDX(prod++, cons_ring->in)] = c; ++ sent++; ++ ++ /* Write to the ring before updating the pointer */ ++ smp_wmb(); ++ ACCESS_ONCE(cons_ring->in_prod) = prod; ++ ++ notify: ++ /* Always notify the guest: prevents receive path from getting stuck. 
*/ ++ pv_shim_inject_evtchn(pv_console_evtchn()); ++ ++ return sent; ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/xen/consoled.h b/xen/include/xen/consoled.h +new file mode 100644 +index 0000000000..fd5d220a8a +--- /dev/null ++++ b/xen/include/xen/consoled.h +@@ -0,0 +1,27 @@ ++#ifndef __XEN_CONSOLED_H__ ++#define __XEN_CONSOLED_H__ ++ ++#include ++ ++#ifdef CONFIG_PV_SHIM ++ ++void consoled_set_ring_addr(struct xencons_interface *ring); ++struct xencons_interface *consoled_get_ring_addr(void); ++size_t consoled_guest_rx(void); ++size_t consoled_guest_tx(char c); ++ ++#else ++ ++static inline size_t consoled_guest_tx(char c) { return 0; } ++ ++#endif /* !CONFIG_PV_SHIM */ ++#endif /* __XEN_CONSOLED_H__ */ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.14.3 + + +From 83c838c9f853712ac5d36c9dc001eb8903b1e1e2 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:19 +0000 +Subject: [PATCH 61/77] xen/pvshim: add migration support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Use bitmap_zero instead of memset. + - Don't drop the __init attribute of unshare_xen_page_with_guest, + it's not needed for migration. + - Remove BUG_ON to check correct mapping, map_domain_page cannot + fail. + - Reduce indentation level of pv_shim_shutdown. 
+--- + xen/arch/x86/guest/xen.c | 29 +++++++ + xen/arch/x86/pv/shim.c | 155 +++++++++++++++++++++++++++++++++++++- + xen/common/domain.c | 11 ++- + xen/common/schedule.c | 3 +- + xen/drivers/char/xen_pv_console.c | 2 +- + xen/include/asm-x86/guest/xen.h | 5 ++ + xen/include/asm-x86/pv/shim.h | 5 +- + xen/include/xen/sched.h | 2 +- + 8 files changed, 197 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 57b297ad47..2a5554ab26 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -348,6 +348,35 @@ uint32_t hypervisor_cpuid_base(void) + return xen_cpuid_base; + } + ++static void ap_resume(void *unused) ++{ ++ map_vcpuinfo(); ++ init_evtchn(); ++} ++ ++void hypervisor_resume(void) ++{ ++ /* Reset shared info page. */ ++ map_shared_info(); ++ ++ /* ++ * Reset vcpu_info. Just clean the mapped bitmap and try to map the vcpu ++ * area again. On failure to map (when it was previously mapped) panic ++ * since it's impossible to safely shut down running guest vCPUs in order ++ * to meet the new XEN_LEGACY_MAX_VCPUS requirement. ++ */ ++ bitmap_zero(vcpu_info_mapped, NR_CPUS); ++ if ( map_vcpuinfo() && nr_cpu_ids > XEN_LEGACY_MAX_VCPUS ) ++ panic("unable to remap vCPU info and vCPUs > legacy limit"); ++ ++ /* Setup event channel upcall vector. 
*/ ++ init_evtchn(); ++ smp_call_function(ap_resume, NULL, 1); ++ ++ if ( pv_console ) ++ pv_console_init(); ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 986f9da58a..c53a4ca407 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -160,10 +160,159 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + guest = d; + } + +-void pv_shim_shutdown(uint8_t reason) ++static void write_start_info(struct domain *d) + { +- /* XXX: handle suspend */ +- xen_hypercall_shutdown(reason); ++ struct cpu_user_regs *regs = guest_cpu_user_regs(); ++ start_info_t *si = map_domain_page(_mfn(is_pv_32bit_domain(d) ? regs->edx ++ : regs->rdx)); ++ uint64_t param; ++ ++ snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%s", ++ is_pv_32bit_domain(d) ? "32p" : "64"); ++ si->nr_pages = d->tot_pages; ++ si->shared_info = virt_to_maddr(d->shared_info); ++ si->flags = 0; ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_STORE_PFN, &si->store_mfn)); ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_STORE_EVTCHN, &param)); ++ si->store_evtchn = param; ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_EVTCHN, &param)); ++ si->console.domU.evtchn = param; ++ if ( pv_console ) ++ si->console.domU.mfn = virt_to_mfn(consoled_get_ring_addr()); ++ else if ( xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_PFN, ++ &si->console.domU.mfn) ) ++ BUG(); ++ ++ if ( is_pv_32bit_domain(d) ) ++ xlat_start_info(si, XLAT_start_info_console_domU); ++ ++ unmap_domain_page(si); ++} ++ ++int pv_shim_shutdown(uint8_t reason) ++{ ++ struct domain *d = current->domain; ++ struct vcpu *v; ++ unsigned int i; ++ uint64_t old_store_pfn, old_console_pfn = 0, store_pfn, console_pfn; ++ uint64_t store_evtchn, console_evtchn; ++ long rc; ++ ++ if ( reason != SHUTDOWN_suspend ) ++ /* Forward to L0. 
*/ ++ return xen_hypercall_shutdown(reason); ++ ++ BUG_ON(current->vcpu_id != 0); ++ ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_STORE_PFN, &old_store_pfn)); ++ if ( !pv_console ) ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_PFN, ++ &old_console_pfn)); ++ ++ /* Pause the other vcpus before starting the migration. */ ++ for_each_vcpu(d, v) ++ if ( v != current ) ++ vcpu_pause_by_systemcontroller(v); ++ ++ rc = xen_hypercall_shutdown(SHUTDOWN_suspend); ++ if ( rc ) ++ { ++ for_each_vcpu(d, v) ++ if ( v != current ) ++ vcpu_unpause_by_systemcontroller(v); ++ ++ return rc; ++ } ++ ++ /* Resume the shim itself first. */ ++ hypervisor_resume(); ++ ++ /* ++ * ATM there's nothing Xen can do if the console/store pfn changes, ++ * because Xen won't have a page_info struct for it. ++ */ ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_STORE_PFN, &store_pfn)); ++ BUG_ON(old_store_pfn != store_pfn); ++ if ( !pv_console ) ++ { ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_PFN, ++ &console_pfn)); ++ BUG_ON(old_console_pfn != console_pfn); ++ } ++ ++ /* Update domain id. */ ++ d->domain_id = get_initial_domain_id(); ++ ++ /* Clean the iomem range. */ ++ BUG_ON(iomem_deny_access(d, 0, ~0UL)); ++ ++ /* Clean grant frames. */ ++ xfree(grant_frames); ++ grant_frames = NULL; ++ nr_grant_list = 0; ++ ++ /* Clean event channels. */ ++ for ( i = 0; i < EVTCHN_2L_NR_CHANNELS; i++ ) ++ { ++ if ( !port_is_valid(d, i) ) ++ continue; ++ ++ if ( evtchn_handled(d, i) ) ++ evtchn_close(d, i, false); ++ else ++ evtchn_free(d, evtchn_from_port(d, i)); ++ } ++ ++ /* Reserve store/console event channel. */ ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_STORE_EVTCHN, &store_evtchn)); ++ BUG_ON(evtchn_allocate_port(d, store_evtchn)); ++ evtchn_reserve(d, store_evtchn); ++ BUG_ON(xen_hypercall_hvm_get_param(HVM_PARAM_CONSOLE_EVTCHN, ++ &console_evtchn)); ++ BUG_ON(evtchn_allocate_port(d, console_evtchn)); ++ evtchn_reserve(d, console_evtchn); ++ ++ /* Clean watchdogs. 
*/ ++ watchdog_domain_destroy(d); ++ watchdog_domain_init(d); ++ ++ /* Clean the PIRQ EOI page. */ ++ if ( d->arch.pirq_eoi_map != NULL ) ++ { ++ unmap_domain_page_global(d->arch.pirq_eoi_map); ++ put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn)); ++ d->arch.pirq_eoi_map = NULL; ++ d->arch.pirq_eoi_map_mfn = 0; ++ d->arch.auto_unmask = 0; ++ } ++ ++ /* ++ * NB: there's no need to fixup the p2m, since the mfns assigned ++ * to the PV guest have not changed at all. Just re-write the ++ * start_info fields with the appropriate value. ++ */ ++ write_start_info(d); ++ ++ for_each_vcpu(d, v) ++ { ++ /* Unmap guest vcpu_info pages. */ ++ unmap_vcpu_info(v); ++ ++ /* Reset the periodic timer to the default value. */ ++ v->periodic_period = MILLISECS(10); ++ /* Stop the singleshot timer. */ ++ stop_timer(&v->singleshot_timer); ++ ++ if ( test_bit(_VPF_down, &v->pause_flags) ) ++ BUG_ON(vcpu_reset(v)); ++ ++ if ( v != current ) ++ vcpu_unpause_by_systemcontroller(v); ++ else ++ vcpu_force_reschedule(v); ++ } ++ ++ return 0; + } + + static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 1ba05fa3a1..9a703734eb 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -689,16 +689,13 @@ void __domain_crash_synchronous(void) + } + + +-void domain_shutdown(struct domain *d, u8 reason) ++int domain_shutdown(struct domain *d, u8 reason) + { + struct vcpu *v; + + #ifdef CONFIG_X86 + if ( pv_shim ) +- { +- pv_shim_shutdown(reason); +- return; +- } ++ return pv_shim_shutdown(reason); + #endif + + spin_lock(&d->shutdown_lock); +@@ -713,7 +710,7 @@ void domain_shutdown(struct domain *d, u8 reason) + if ( d->is_shutting_down ) + { + spin_unlock(&d->shutdown_lock); +- return; ++ return 0; + } + + d->is_shutting_down = 1; +@@ -735,6 +732,8 @@ void domain_shutdown(struct domain *d, u8 reason) + __domain_finalise_shutdown(d); + + spin_unlock(&d->shutdown_lock); ++ ++ return 0; + } + + void 
domain_resume(struct domain *d) +diff --git a/xen/common/schedule.c b/xen/common/schedule.c +index 88279213e8..b7884263f2 100644 +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -1149,11 +1149,10 @@ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + if ( copy_from_guest(&sched_shutdown, arg, 1) ) + break; + +- ret = 0; + TRACE_3D(TRC_SCHED_SHUTDOWN, + current->domain->domain_id, current->vcpu_id, + sched_shutdown.reason); +- domain_shutdown(current->domain, (u8)sched_shutdown.reason); ++ ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); + + break; + } +diff --git a/xen/drivers/char/xen_pv_console.c b/xen/drivers/char/xen_pv_console.c +index 948343303e..cc1c1d743f 100644 +--- a/xen/drivers/char/xen_pv_console.c ++++ b/xen/drivers/char/xen_pv_console.c +@@ -37,7 +37,7 @@ static DEFINE_SPINLOCK(tx_lock); + + bool pv_console; + +-void __init pv_console_init(void) ++void pv_console_init(void) + { + long r; + uint64_t raw_pfn = 0, raw_evtchn = 0; +diff --git a/xen/include/asm-x86/guest/xen.h b/xen/include/asm-x86/guest/xen.h +index ac48dcbe44..11243fe60d 100644 +--- a/xen/include/asm-x86/guest/xen.h ++++ b/xen/include/asm-x86/guest/xen.h +@@ -39,6 +39,7 @@ int hypervisor_free_unused_page(mfn_t mfn); + void hypervisor_fixup_e820(struct e820map *e820); + const unsigned long *hypervisor_reserved_pages(unsigned int *size); + uint32_t hypervisor_cpuid_base(void); ++void hypervisor_resume(void); + + DECLARE_PER_CPU(unsigned int, vcpu_id); + DECLARE_PER_CPU(struct vcpu_info *, vcpu_info); +@@ -72,6 +73,10 @@ static inline uint32_t hypervisor_cpuid_base(void) + ASSERT_UNREACHABLE(); + return 0; + }; ++static inline void hypervisor_resume(void) ++{ ++ ASSERT_UNREACHABLE(); ++}; + + #endif /* CONFIG_XEN_GUEST */ + #endif /* __X86_GUEST_XEN_H__ */ +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index ab656fd854..4d5f0b43fc 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -35,7 
+35,7 @@ void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + unsigned long va_start, unsigned long store_va, + unsigned long console_va, unsigned long vphysmap, + start_info_t *si); +-void pv_shim_shutdown(uint8_t reason); ++int pv_shim_shutdown(uint8_t reason); + void pv_shim_inject_evtchn(unsigned int port); + domid_t get_initial_domain_id(void); + +@@ -50,9 +50,10 @@ static inline void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + { + ASSERT_UNREACHABLE(); + } +-static inline void pv_shim_shutdown(uint8_t reason) ++static inline int pv_shim_shutdown(uint8_t reason) + { + ASSERT_UNREACHABLE(); ++ return 0; + } + static inline void pv_shim_inject_evtchn(unsigned int port) + { +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 64abc1df6c..2541ecb04f 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -603,7 +603,7 @@ static inline struct domain *rcu_lock_current_domain(void) + struct domain *get_domain_by_id(domid_t dom); + void domain_destroy(struct domain *d); + int domain_kill(struct domain *d); +-void domain_shutdown(struct domain *d, u8 reason); ++int domain_shutdown(struct domain *d, u8 reason); + void domain_resume(struct domain *d); + void domain_pause_for_debugger(void); + +-- +2.14.3 + + +From 7dcc20e0c8cf6fa30f483b0c91c8566a97a61031 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 62/77] xen/pvshim: add shim_mem cmdline parameter + +Signed-off-by: Sergey Dyasli +--- + docs/misc/xen-command-line.markdown | 16 +++++++++++++ + xen/arch/x86/dom0_build.c | 18 ++++++++++++++- + xen/arch/x86/pv/shim.c | 46 +++++++++++++++++++++++++++++++++++++ + xen/include/asm-x86/pv/shim.h | 6 +++++ + 4 files changed, 85 insertions(+), 1 deletion(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 3a1a9c1fba..9f51710a46 100644 +--- a/docs/misc/xen-command-line.markdown ++++ 
b/docs/misc/xen-command-line.markdown +@@ -686,6 +686,8 @@ any dom0 autoballooning feature present in your toolstack. See the + _xl.conf(5)_ man page or [Xen Best + Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning). + ++This option doesn't have effect if pv-shim mode is enabled. ++ + ### dom0\_nodes + + > `= List of [ | relaxed | strict ]` +@@ -1456,6 +1458,20 @@ guest compatibly inside an HVM container. + In this mode, the kernel and initrd passed as modules to the hypervisor are + constructed into a plain unprivileged PV domain. + ++### shim\_mem (x86) ++> `= List of ( min: | max: | )` ++ ++Set the amount of memory that xen-shim reserves for itself. Only has effect ++if pv-shim mode is enabled. ++ ++* `min:` specifies the minimum amount of memory. Ignored if greater ++ than max. Default: 10M. ++* `max:` specifies the maximum amount of memory. Default: 128M. ++* `` specifies the exact amount of memory. Overrides both min and max. ++ ++By default, 1/16th of total HVM container's memory is reserved for xen-shim ++with minimum amount being 10MB and maximum amount 128MB. ++ + ### rcu-idle-timer-period-ms + > `= ` + +diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c +index 452298c624..bc713fb2b5 100644 +--- a/xen/arch/x86/dom0_build.c ++++ b/xen/arch/x86/dom0_build.c +@@ -51,6 +51,13 @@ static long __init parse_amt(const char *s, const char **ps) + + static int __init parse_dom0_mem(const char *s) + { ++ /* xen-shim uses shim_mem parameter instead of dom0_mem */ ++ if ( pv_shim ) ++ { ++ printk("Ignoring dom0_mem param in pv-shim mode\n"); ++ return 0; ++ } ++ + do { + if ( !strncmp(s, "min:", 4) ) + dom0_min_nrpages = parse_amt(s+4, &s); +@@ -284,7 +291,16 @@ unsigned long __init dom0_compute_nr_pages( + * maximum of 128MB. 
+ */ + if ( nr_pages == 0 ) +- nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT)); ++ { ++ uint64_t rsvd = min(avail / 16, 128UL << (20 - PAGE_SHIFT)); ++ if ( pv_shim ) ++ { ++ rsvd = pv_shim_mem(avail); ++ printk("Reserved %lu pages for xen-shim\n", rsvd); ++ ++ } ++ nr_pages = -rsvd; ++ } + + /* Negative specification means "all memory - specified amount". */ + if ( (long)nr_pages < 0 ) nr_pages += avail; +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index c53a4ca407..6dc1ee45d7 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -53,6 +53,52 @@ static long pv_shim_grant_table_op(unsigned int cmd, + XEN_GUEST_HANDLE_PARAM(void) uop, + unsigned int count); + ++/* ++ * By default, 1/16th of total HVM container's memory is reserved for xen-shim ++ * with minimum amount being 10MB and maximum amount 128MB. Some users may wish ++ * to tune this constants for better memory utilization. This can be achieved ++ * using the following xen-shim's command line option: ++ * ++ * shim_mem=[min:,][max:,][] ++ * ++ * : The minimum amount of memory that should be allocated for xen-shim ++ * (ignored if greater than max) ++ * : The maximum amount of memory that should be allocated for xen-shim ++ * : The precise amount of memory to allocate for xen-shim ++ * (overrides both min and max) ++ */ ++static uint64_t __initdata shim_nrpages; ++static uint64_t __initdata shim_min_nrpages = 10UL << (20 - PAGE_SHIFT); ++static uint64_t __initdata shim_max_nrpages = 128UL << (20 - PAGE_SHIFT); ++ ++static int __init parse_shim_mem(const char *s) ++{ ++ do { ++ if ( !strncmp(s, "min:", 4) ) ++ shim_min_nrpages = parse_size_and_unit(s+4, &s) >> PAGE_SHIFT; ++ else if ( !strncmp(s, "max:", 4) ) ++ shim_max_nrpages = parse_size_and_unit(s+4, &s) >> PAGE_SHIFT; ++ else ++ shim_nrpages = parse_size_and_unit(s, &s) >> PAGE_SHIFT; ++ } while ( *s++ == ',' ); ++ ++ return s[-1] ? 
-EINVAL : 0; ++} ++custom_param("shim_mem", parse_shim_mem); ++ ++uint64_t pv_shim_mem(uint64_t avail) ++{ ++ uint64_t rsvd = min(avail / 16, shim_max_nrpages); ++ ++ if ( shim_nrpages ) ++ return shim_nrpages; ++ ++ if ( shim_min_nrpages <= shim_max_nrpages ) ++ rsvd = max(rsvd, shim_min_nrpages); ++ ++ return rsvd; ++} ++ + #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER| \ + _PAGE_GUEST_KERNEL) + #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index 4d5f0b43fc..0ef452158e 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -38,6 +38,7 @@ void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + int pv_shim_shutdown(uint8_t reason); + void pv_shim_inject_evtchn(unsigned int port); + domid_t get_initial_domain_id(void); ++uint64_t pv_shim_mem(uint64_t avail); + + #else + +@@ -63,6 +64,11 @@ static inline domid_t get_initial_domain_id(void) + { + return 0; + } ++static inline uint64_t pv_shim_mem(uint64_t avail) ++{ ++ ASSERT_UNREACHABLE(); ++ return 0; ++} + + #endif + +-- +2.14.3 + + +From 004646a1dd4ff2f768d942689545dd3b6e2135e2 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 63/77] xen/pvshim: set max_pages to the value of tot_pages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +So that the guest is not able to deplete the memory pool of the shim +itself by trying to balloon up. 
+ +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +--- + xen/arch/x86/pv/shim.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 6dc1ee45d7..e3e101a5b1 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -204,6 +204,12 @@ void __init pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + (hypercall_fn_t *)pv_shim_grant_table_op, + (hypercall_fn_t *)pv_shim_grant_table_op); + guest = d; ++ ++ /* ++ * Set the max pages to the current number of pages to prevent the ++ * guest from depleting the shim memory pool. ++ */ ++ d->max_pages = d->tot_pages; + } + + static void write_start_info(struct domain *d) +-- +2.14.3 + + +From 5b6c3ffa1d291724a329b57658783fc30b93b479 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 64/77] xen/pvshim: support vCPU hotplug +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Fix hotplug so that v->initialized is checked before attempting to + bring up the physical CPU. + - Fix ARM compilation. 
+--- + xen/arch/x86/pv/shim.c | 63 +++++++++++++++++++++++++++++++++++++++++++ + xen/common/domain.c | 38 +++++++++++++++++--------- + xen/include/asm-x86/pv/shim.h | 12 +++++++++ + xen/include/xen/domain.h | 1 + + 4 files changed, 102 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index e3e101a5b1..68ec7bed8e 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -751,6 +751,69 @@ static long pv_shim_grant_table_op(unsigned int cmd, + return rc; + } + ++long pv_shim_cpu_up(void *data) ++{ ++ struct vcpu *v = data; ++ struct domain *d = v->domain; ++ bool wake; ++ ++ BUG_ON(smp_processor_id() != 0); ++ ++ domain_lock(d); ++ if ( !v->is_initialised ) ++ { ++ domain_unlock(d); ++ return -EINVAL; ++ } ++ ++ if ( !cpu_online(v->vcpu_id) ) ++ { ++ long rc = cpu_up_helper((void *)(unsigned long)v->vcpu_id); ++ ++ if ( rc ) ++ { ++ domain_unlock(d); ++ gprintk(XENLOG_ERR, "Failed to bring up CPU#%u: %ld\n", ++ v->vcpu_id, rc); ++ return rc; ++ } ++ } ++ ++ wake = test_and_clear_bit(_VPF_down, &v->pause_flags); ++ domain_unlock(d); ++ if ( wake ) ++ vcpu_wake(v); ++ ++ return 0; ++} ++ ++long pv_shim_cpu_down(void *data) ++{ ++ struct vcpu *v = data; ++ long rc; ++ ++ BUG_ON(smp_processor_id() != 0); ++ ++ if ( !test_and_set_bit(_VPF_down, &v->pause_flags) ) ++ vcpu_sleep_sync(v); ++ ++ if ( cpu_online(v->vcpu_id) ) ++ { ++ rc = cpu_down_helper((void *)(unsigned long)v->vcpu_id); ++ if ( rc ) ++ gprintk(XENLOG_ERR, "Failed to bring down CPU#%u: %ld\n", ++ v->vcpu_id, rc); ++ /* ++ * NB: do not propagate errors from cpu_down_helper failing. The shim ++ * is going to run with extra CPUs, but that's not going to prevent ++ * normal operation. OTOH most guests are not prepared to handle an ++ * error on VCPUOP_down failing, and will likely panic. 
++ */ ++ } ++ ++ return 0; ++} ++ + domid_t get_initial_domain_id(void) + { + uint32_t eax, ebx, ecx, edx; +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 9a703734eb..8fbd33d4c6 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -1293,22 +1293,36 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) + + break; + +- case VCPUOP_up: { +- bool_t wake = 0; +- domain_lock(d); +- if ( !v->is_initialised ) +- rc = -EINVAL; ++ case VCPUOP_up: ++#ifdef CONFIG_X86 ++ if ( pv_shim ) ++ rc = continue_hypercall_on_cpu(0, pv_shim_cpu_up, v); + else +- wake = test_and_clear_bit(_VPF_down, &v->pause_flags); +- domain_unlock(d); +- if ( wake ) +- vcpu_wake(v); ++#endif ++ { ++ bool wake = false; ++ ++ domain_lock(d); ++ if ( !v->is_initialised ) ++ rc = -EINVAL; ++ else ++ wake = test_and_clear_bit(_VPF_down, &v->pause_flags); ++ domain_unlock(d); ++ if ( wake ) ++ vcpu_wake(v); ++ } ++ + break; +- } + + case VCPUOP_down: +- if ( !test_and_set_bit(_VPF_down, &v->pause_flags) ) +- vcpu_sleep_nosync(v); ++#ifdef CONFIG_X86 ++ if ( pv_shim ) ++ rc = continue_hypercall_on_cpu(0, pv_shim_cpu_down, v); ++ else ++#endif ++ if ( !test_and_set_bit(_VPF_down, &v->pause_flags) ) ++ vcpu_sleep_nosync(v); ++ + break; + + case VCPUOP_is_up: +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index 0ef452158e..eb59ddd38a 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -37,6 +37,8 @@ void pv_shim_setup_dom(struct domain *d, l4_pgentry_t *l4start, + start_info_t *si); + int pv_shim_shutdown(uint8_t reason); + void pv_shim_inject_evtchn(unsigned int port); ++long pv_shim_cpu_up(void *data); ++long pv_shim_cpu_down(void *data); + domid_t get_initial_domain_id(void); + uint64_t pv_shim_mem(uint64_t avail); + +@@ -60,6 +62,16 @@ static inline void pv_shim_inject_evtchn(unsigned int port) + { + ASSERT_UNREACHABLE(); + } ++static inline long pv_shim_cpu_up(void *data) ++{ ++ 
ASSERT_UNREACHABLE(); ++ return 0; ++} ++static inline long pv_shim_cpu_down(void *data) ++{ ++ ASSERT_UNREACHABLE(); ++ return 0; ++} + static inline domid_t get_initial_domain_id(void) + { + return 0; +diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h +index 347f264047..eb62f1dab1 100644 +--- a/xen/include/xen/domain.h ++++ b/xen/include/xen/domain.h +@@ -17,6 +17,7 @@ struct vcpu *alloc_vcpu( + struct domain *d, unsigned int vcpu_id, unsigned int cpu_id); + struct vcpu *alloc_dom0_vcpu0(struct domain *dom0); + int vcpu_reset(struct vcpu *); ++int vcpu_up(struct vcpu *v); + + struct xen_domctl_getdomaininfo; + void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info); +-- +2.14.3 + + +From 29dd3142bf7115d45836a6de7a72c17a4dac7cc8 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 65/77] xen/pvshim: memory hotplug +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Add an order parameter to batch_memory_op. + - Add a TODO item regarding high order memory chunks to + pv_shim_online_memory. + - Use page_list_splice. + - Make sure the shim handlers are not called multiple times when + the hypercall is preempted. 
+--- + xen/arch/x86/pv/shim.c | 112 ++++++++++++++++++++++++++++++++++++++++++ + xen/common/memory.c | 21 ++++++++ + xen/include/asm-x86/pv/shim.h | 10 ++++ + 3 files changed, 143 insertions(+) + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 68ec7bed8e..4120cc550e 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -48,6 +48,9 @@ static unsigned int nr_grant_list; + static unsigned long *grant_frames; + static DEFINE_SPINLOCK(grant_lock); + ++static PAGE_LIST_HEAD(balloon); ++static DEFINE_SPINLOCK(balloon_lock); ++ + static long pv_shim_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); + static long pv_shim_grant_table_op(unsigned int cmd, + XEN_GUEST_HANDLE_PARAM(void) uop, +@@ -814,6 +817,115 @@ long pv_shim_cpu_down(void *data) + return 0; + } + ++static unsigned long batch_memory_op(unsigned int cmd, unsigned int order, ++ const struct page_list_head *list) ++{ ++ struct xen_memory_reservation xmr = { ++ .domid = DOMID_SELF, ++ .extent_order = order, ++ }; ++ unsigned long pfns[64]; ++ const struct page_info *pg; ++ unsigned long done = 0; ++ ++ set_xen_guest_handle(xmr.extent_start, pfns); ++ page_list_for_each ( pg, list ) ++ { ++ pfns[xmr.nr_extents++] = page_to_mfn(pg); ++ if ( xmr.nr_extents == ARRAY_SIZE(pfns) || !page_list_next(pg, list) ) ++ { ++ long nr = xen_hypercall_memory_op(cmd, &xmr); ++ ++ done += nr > 0 ? nr : 0; ++ if ( nr != xmr.nr_extents ) ++ break; ++ xmr.nr_extents = 0; ++ } ++ } ++ ++ return done; ++} ++ ++void pv_shim_online_memory(unsigned int nr, unsigned int order) ++{ ++ struct page_info *page, *tmp; ++ PAGE_LIST_HEAD(list); ++ ++ spin_lock(&balloon_lock); ++ page_list_for_each_safe ( page, tmp, &balloon ) ++ { ++ /* TODO: add support for splitting high order memory chunks. 
*/ ++ if ( page->v.free.order != order ) ++ continue; ++ ++ page_list_del(page, &balloon); ++ page_list_add_tail(page, &list); ++ if ( !--nr ) ++ break; ++ } ++ spin_unlock(&balloon_lock); ++ ++ if ( nr ) ++ gprintk(XENLOG_WARNING, ++ "failed to allocate %u extents of order %u for onlining\n", ++ nr, order); ++ ++ nr = batch_memory_op(XENMEM_populate_physmap, order, &list); ++ while ( nr-- ) ++ { ++ BUG_ON((page = page_list_remove_head(&list)) == NULL); ++ free_domheap_pages(page, order); ++ } ++ ++ if ( !page_list_empty(&list) ) ++ { ++ gprintk(XENLOG_WARNING, ++ "failed to online some of the memory regions\n"); ++ spin_lock(&balloon_lock); ++ page_list_splice(&list, &balloon); ++ spin_unlock(&balloon_lock); ++ } ++} ++ ++void pv_shim_offline_memory(unsigned int nr, unsigned int order) ++{ ++ struct page_info *page; ++ PAGE_LIST_HEAD(list); ++ ++ while ( nr-- ) ++ { ++ page = alloc_domheap_pages(NULL, order, 0); ++ if ( !page ) ++ break; ++ ++ page_list_add_tail(page, &list); ++ page->v.free.order = order; ++ } ++ ++ if ( nr + 1 ) ++ gprintk(XENLOG_WARNING, ++ "failed to reserve %u extents of order %u for offlining\n", ++ nr + 1, order); ++ ++ ++ nr = batch_memory_op(XENMEM_decrease_reservation, order, &list); ++ spin_lock(&balloon_lock); ++ while ( nr-- ) ++ { ++ BUG_ON((page = page_list_remove_head(&list)) == NULL); ++ page_list_add_tail(page, &balloon); ++ } ++ spin_unlock(&balloon_lock); ++ ++ if ( !page_list_empty(&list) ) ++ { ++ gprintk(XENLOG_WARNING, ++ "failed to offline some of the memory regions\n"); ++ while ( (page = page_list_remove_head(&list)) != NULL ) ++ free_domheap_pages(page, order); ++ } ++} ++ + domid_t get_initial_domain_id(void) + { + uint32_t eax, ebx, ecx, edx; +diff --git a/xen/common/memory.c b/xen/common/memory.c +index a6ba33fdcb..9eed96a9ce 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -29,6 +29,10 @@ + #include + #include + ++#ifdef CONFIG_X86 ++#include ++#endif ++ + struct memop_args { + /* INPUT */ + struct 
domain *domain; /* Domain to be affected. */ +@@ -1019,6 +1023,12 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + return start_extent; + } + ++#ifdef CONFIG_X86 ++ if ( pv_shim && op != XENMEM_decrease_reservation && !args.preempted ) ++ /* Avoid calling pv_shim_online_memory when preempted. */ ++ pv_shim_online_memory(args.nr_extents, args.extent_order); ++#endif ++ + switch ( op ) + { + case XENMEM_increase_reservation: +@@ -1041,6 +1051,17 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + __HYPERVISOR_memory_op, "lh", + op | (rc << MEMOP_EXTENT_SHIFT), arg); + ++#ifdef CONFIG_X86 ++ if ( pv_shim && op == XENMEM_decrease_reservation ) ++ /* ++ * Only call pv_shim_offline_memory when the hypercall has ++ * finished. Note that nr_done is used to cope in case the ++ * hypercall has failed and only part of the extents were ++ * processed. ++ */ ++ pv_shim_offline_memory(args.nr_done, args.extent_order); ++#endif ++ + break; + + case XENMEM_exchange: +diff --git a/xen/include/asm-x86/pv/shim.h b/xen/include/asm-x86/pv/shim.h +index eb59ddd38a..fb739772df 100644 +--- a/xen/include/asm-x86/pv/shim.h ++++ b/xen/include/asm-x86/pv/shim.h +@@ -39,6 +39,8 @@ int pv_shim_shutdown(uint8_t reason); + void pv_shim_inject_evtchn(unsigned int port); + long pv_shim_cpu_up(void *data); + long pv_shim_cpu_down(void *data); ++void pv_shim_online_memory(unsigned int nr, unsigned int order); ++void pv_shim_offline_memory(unsigned int nr, unsigned int order); + domid_t get_initial_domain_id(void); + uint64_t pv_shim_mem(uint64_t avail); + +@@ -72,6 +74,14 @@ static inline long pv_shim_cpu_down(void *data) + ASSERT_UNREACHABLE(); + return 0; + } ++static inline void pv_shim_online_memory(unsigned int nr, unsigned int order) ++{ ++ ASSERT_UNREACHABLE(); ++} ++static inline void pv_shim_offline_memory(unsigned int nr, unsigned int order) ++{ ++ ASSERT_UNREACHABLE(); ++} + static inline domid_t get_initial_domain_id(void) + { + return 0; 
+-- +2.14.3 + + +From 9d60bc96bef01444e30a9653ebf06b24c5bc8be5 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 66/77] xen/shim: modify shim_mem parameter behaviour +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +shim_mem will now account for both the memory used by the hypervisor +loaded in memory and the free memory slack given to the shim for +runtime usage. + +From experimental testing it seems like the total amount of MiB used +by the shim (giving it ~1MB of free memory for runtime) is: + +memory/113 + 20 + +Signed-off-by: Roger Pau Monné +--- + docs/misc/xen-command-line.markdown | 13 +++++++------ + xen/arch/x86/dom0_build.c | 14 +++----------- + xen/arch/x86/pv/shim.c | 30 +++++++++++++++++++----------- + 3 files changed, 29 insertions(+), 28 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 9f51710a46..68ec52b5c2 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1461,16 +1461,17 @@ constructed into a plain unprivileged PV domain. + ### shim\_mem (x86) + > `= List of ( min: | max: | )` + +-Set the amount of memory that xen-shim reserves for itself. Only has effect +-if pv-shim mode is enabled. ++Set the amount of memory that xen-shim uses. Only has effect if pv-shim mode is ++enabled. Note that this value accounts for the memory used by the shim itself ++plus the free memory slack given to the shim for runtime allocations. + + * `min:` specifies the minimum amount of memory. Ignored if greater +- than max. Default: 10M. +-* `max:` specifies the maximum amount of memory. Default: 128M. ++ than max. ++* `max:` specifies the maximum amount of memory. + * `` specifies the exact amount of memory. Overrides both min and max. 
+ +-By default, 1/16th of total HVM container's memory is reserved for xen-shim +-with minimum amount being 10MB and maximum amount 128MB. ++By default, the amount of free memory slack given to the shim for runtime usage ++is 1MB. + + ### rcu-idle-timer-period-ms + > `= ` +diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c +index bc713fb2b5..d77c6b40de 100644 +--- a/xen/arch/x86/dom0_build.c ++++ b/xen/arch/x86/dom0_build.c +@@ -290,17 +290,9 @@ unsigned long __init dom0_compute_nr_pages( + * for things like DMA buffers. This reservation is clamped to a + * maximum of 128MB. + */ +- if ( nr_pages == 0 ) +- { +- uint64_t rsvd = min(avail / 16, 128UL << (20 - PAGE_SHIFT)); +- if ( pv_shim ) +- { +- rsvd = pv_shim_mem(avail); +- printk("Reserved %lu pages for xen-shim\n", rsvd); +- +- } +- nr_pages = -rsvd; +- } ++ if ( !nr_pages ) ++ nr_pages = -(pv_shim ? pv_shim_mem(avail) ++ : min(avail / 16, 128UL << (20 - PAGE_SHIFT))); + + /* Negative specification means "all memory - specified amount". */ + if ( (long)nr_pages < 0 ) nr_pages += avail; +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index 4120cc550e..702249719e 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -57,9 +57,8 @@ static long pv_shim_grant_table_op(unsigned int cmd, + unsigned int count); + + /* +- * By default, 1/16th of total HVM container's memory is reserved for xen-shim +- * with minimum amount being 10MB and maximum amount 128MB. Some users may wish +- * to tune this constants for better memory utilization. This can be achieved ++ * By default give the shim 1MB of free memory slack. Some users may wish to ++ * tune this constants for better memory utilization. 
This can be achieved + * using the following xen-shim's command line option: + * + * shim_mem=[min:,][max:,][] +@@ -71,8 +70,8 @@ static long pv_shim_grant_table_op(unsigned int cmd, + * (overrides both min and max) + */ + static uint64_t __initdata shim_nrpages; +-static uint64_t __initdata shim_min_nrpages = 10UL << (20 - PAGE_SHIFT); +-static uint64_t __initdata shim_max_nrpages = 128UL << (20 - PAGE_SHIFT); ++static uint64_t __initdata shim_min_nrpages; ++static uint64_t __initdata shim_max_nrpages; + + static int __init parse_shim_mem(const char *s) + { +@@ -91,15 +90,24 @@ custom_param("shim_mem", parse_shim_mem); + + uint64_t pv_shim_mem(uint64_t avail) + { +- uint64_t rsvd = min(avail / 16, shim_max_nrpages); ++ if ( !shim_nrpages ) ++ { ++ shim_nrpages = max(shim_min_nrpages, ++ total_pages - avail + (1UL << (20 - PAGE_SHIFT))); ++ if ( shim_max_nrpages ) ++ shim_max_nrpages = min(shim_nrpages, shim_max_nrpages); ++ } ++ ++ if ( total_pages - avail > shim_nrpages ) ++ panic("pages used by shim > shim_nrpages (%#lx > %#lx)", ++ total_pages - avail, shim_nrpages); + +- if ( shim_nrpages ) +- return shim_nrpages; ++ shim_nrpages -= total_pages - avail; + +- if ( shim_min_nrpages <= shim_max_nrpages ) +- rsvd = max(rsvd, shim_min_nrpages); ++ printk("shim used pages %#lx reserving %#lx free pages\n", ++ total_pages - avail, shim_nrpages); + +- return rsvd; ++ return shim_nrpages; + } + + #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER| \ +-- +2.14.3 + + +From b5be9c817d04b006886a0d7b87eacf7bd78f504d Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:20 +0000 +Subject: [PATCH 67/77] xen/pvshim: use default position for the m2p mappings +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When running a 32bit kernel as Dom0 on a 64bit hypervisor the +hypervisor will try to shrink the hypervisor hole to the minimum +needed, and thus requires the Dom0 to use 
XENMEM_machphys_mapping in +order to fetch the position of the start of the hypervisor virtual +mappings. + +Disable this feature when running as a PV shim, since some DomU +kernels don't implement XENMEM_machphys_mapping and break if the m2p +doesn't begin at the default address. + +NB: support for the XENMEM_machphys_mapping was added in Linux by +commit 7e7750. + +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +--- + xen/arch/x86/pv/dom0_build.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index 72752b8656..ebcb47bf26 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -398,7 +398,8 @@ int __init dom0_construct_pv(struct domain *d, + if ( parms.pae == XEN_PAE_EXTCR3 ) + set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist); + +- if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) ) ++ if ( !pv_shim && (parms.virt_hv_start_low != UNSET_ADDR) && ++ elf_32bit(&elf) ) + { + unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1; + value = (parms.virt_hv_start_low + mask) & ~mask; +-- +2.14.3 + + +From c9083de0ae6b0f5b42e7f92f6d43edc3bd09d4f1 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:21 +0000 +Subject: [PATCH 68/77] xen/shim: crash instead of reboot in shim mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +All guest shutdown operations are forwarded to L0, so the only native +calls to machine_restart happen from crash related paths inside the +hypervisor, hence switch the reboot code to instead issue a crash +shutdown. + +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +[ wei: fix arm build ] +Signed-off-by: Wei Liu +--- +Changes since v1: + - Use the ternary operator. 
+--- + xen/arch/x86/shutdown.c | 7 ++++++- + xen/drivers/char/console.c | 4 ++++ + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/shutdown.c b/xen/arch/x86/shutdown.c +index 689f6f137d..a350714319 100644 +--- a/xen/arch/x86/shutdown.c ++++ b/xen/arch/x86/shutdown.c +@@ -642,7 +642,12 @@ void machine_restart(unsigned int delay_millisecs) + break; + + case BOOT_XEN: +- xen_hypercall_shutdown(SHUTDOWN_reboot); ++ /* ++ * When running in PV shim mode guest shutdown calls are ++ * forwarded to L0, hence the only way to get here is if a ++ * shim crash happens. ++ */ ++ xen_hypercall_shutdown(pv_shim ? SHUTDOWN_crash : SHUTDOWN_reboot); + break; + } + } +diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c +index 18f5b7f7aa..121073c8ed 100644 +--- a/xen/drivers/char/console.c ++++ b/xen/drivers/char/console.c +@@ -1253,7 +1253,11 @@ void panic(const char *fmt, ...) + if ( opt_noreboot ) + printk("Manual reset required ('noreboot' specified)\n"); + else ++#ifdef CONFIG_X86 ++ printk("%s in five seconds...\n", pv_shim ? "Crash" : "Reboot"); ++#else + printk("Reboot in five seconds...\n"); ++#endif + + spin_unlock_irqrestore(&lock, flags); + +-- +2.14.3 + + +From 321ef983a06bc14570b79da1ab60344e3feb2c2b Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 11 Jan 2018 11:41:21 +0000 +Subject: [PATCH 69/77] xen/shim: allow DomU to have as many vcpus as available +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Since the shim VCPUOP_{up/down} hypercall is wired to the plug/unplug +of CPUs to the shim itself, start the shim DomU with only the BSP +online, and let the guest bring up other CPUs as it needs them. + +Signed-off-by: Roger Pau Monné +--- +Changes since v1: + - Fix single line comment style. + - Print Dom%u d->domain_id. + - Change position of __start_xen comment. 
+--- + xen/arch/x86/dom0_build.c | 30 +++++++++++++++++++++++++++--- + xen/arch/x86/pv/dom0_build.c | 2 +- + xen/arch/x86/setup.c | 28 ++++++++++++++++++---------- + 3 files changed, 46 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c +index d77c6b40de..b4f4a4ac4a 100644 +--- a/xen/arch/x86/dom0_build.c ++++ b/xen/arch/x86/dom0_build.c +@@ -138,9 +138,18 @@ struct vcpu *__init dom0_setup_vcpu(struct domain *d, + + if ( v ) + { +- if ( !d->is_pinned && !dom0_affinity_relaxed ) +- cpumask_copy(v->cpu_hard_affinity, &dom0_cpus); +- cpumask_copy(v->cpu_soft_affinity, &dom0_cpus); ++ if ( pv_shim ) ++ { ++ ++ cpumask_setall(v->cpu_hard_affinity); ++ cpumask_setall(v->cpu_soft_affinity); ++ } ++ else ++ { ++ if ( !d->is_pinned && !dom0_affinity_relaxed ) ++ cpumask_copy(v->cpu_hard_affinity, &dom0_cpus); ++ cpumask_copy(v->cpu_soft_affinity, &dom0_cpus); ++ } + } + + return v; +@@ -153,6 +162,21 @@ unsigned int __init dom0_max_vcpus(void) + unsigned int i, max_vcpus, limit; + nodeid_t node; + ++ if ( pv_shim ) ++ { ++ nodes_setall(dom0_nodes); ++ ++ /* ++ * When booting in shim mode APs are not started until the guest brings ++ * other vCPUs up. ++ */ ++ cpumask_set_cpu(0, &dom0_cpus); ++ ++ /* On PV shim mode allow the guest to have as many CPUs as available. 
*/ ++ return nr_cpu_ids; ++ } ++ ++ + for ( i = 0; i < dom0_nr_pxms; ++i ) + if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE ) + node_set(node, dom0_nodes); +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index ebcb47bf26..5d8909fa13 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -701,7 +701,7 @@ int __init dom0_construct_pv(struct domain *d, + for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) + shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; + +- printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus); ++ printk("Dom%u has maximum %u VCPUs\n", d->domain_id, d->max_vcpus); + + cpu = v->processor; + for ( i = 1; i < d->max_vcpus; i++ ) +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 7091c38047..cf07e5045d 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -1584,18 +1584,26 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + do_presmp_initcalls(); + +- for_each_present_cpu ( i ) ++ /* ++ * NB: when running as a PV shim VCPUOP_up/down is wired to the shim ++ * physical cpu_add/remove functions, so launch the guest with only ++ * the BSP online and let it bring up the other CPUs as required. ++ */ ++ if ( !pv_shim ) + { +- /* Set up cpu_to_node[]. */ +- srat_detect_node(i); +- /* Set up node_to_cpumask based on cpu_to_node[]. */ +- numa_add_cpu(i); +- +- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) ) ++ for_each_present_cpu ( i ) + { +- int ret = cpu_up(i); +- if ( ret != 0 ) +- printk("Failed to bring up CPU %u (error %d)\n", i, ret); ++ /* Set up cpu_to_node[]. */ ++ srat_detect_node(i); ++ /* Set up node_to_cpumask based on cpu_to_node[]. 
*/ ++ numa_add_cpu(i); ++ ++ if ( (num_online_cpus() < max_cpus) && !cpu_online(i) ) ++ { ++ int ret = cpu_up(i); ++ if ( ret != 0 ) ++ printk("Failed to bring up CPU %u (error %d)\n", i, ret); ++ } + } + } + +-- +2.14.3 + + +From abdde49edc15cc4dc61356d7f3f8f52a2d14e2d8 Mon Sep 17 00:00:00 2001 +From: Ian Jackson +Date: Thu, 14 Dec 2017 16:16:20 +0000 +Subject: [PATCH 70/77] libxl: pvshim: Provide first-class config settings to + enable shim mode + +This is API-compatible because old callers are supposed to call +libxl_*_init to initialise the struct; and the updated function clears +these members. + +It is ABI-compatible because the new fields make this member of the +guest type union larger but only within the existing size of that +union. + +Unfortunately it is not easy to backport because it depends on the PVH +domain type. Attempts to avoid use of the PVH domain type involved +working with two views of the configuration: the "underlying" domain +type and the "visible" type (and corresponding config info). Also +there are different sets of config settings for PV and PVH, which +callers would have to know to set. + +And, unfortunately, it will not be possible, with this approach, to +enable the shim by default for all libxl callers. (Although it could +perhaps be done in xl.) + +For now, our config defaults are: + * if enabled, path is "xen-shim" in the xen firmware directory + * if enabled, cmdline is the one we are currently debugging with + +The debugging arguments will be rationalised in a moment. 
+ +Signed-off-by: Ian Jackson +Signed-off-by: George Dunlap +Signed-off-by: Wei Liu +--- +v2: pvshim, not pvhshim + works with type "pvh", not type "pv" +--- + tools/libxl/libxl.h | 8 +++++++ + tools/libxl/libxl_create.c | 15 ++++++++++++ + tools/libxl/libxl_dom.c | 57 +++++++++++++++++++++++++++++++++++--------- + tools/libxl/libxl_internal.h | 4 ++++ + tools/libxl/libxl_types.idl | 5 +++- + 5 files changed, 77 insertions(+), 12 deletions(-) + +diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h +index 5e9aed739d..9632fd6d2f 100644 +--- a/tools/libxl/libxl.h ++++ b/tools/libxl/libxl.h +@@ -1101,6 +1101,14 @@ void libxl_mac_copy(libxl_ctx *ctx, libxl_mac *dst, const libxl_mac *src); + */ + #define LIBXL_HAVE_SET_PARAMETERS 1 + ++/* ++ * LIBXL_HAVE_PV_SHIM ++ * ++ * If this is defined, libxl_domain_build_info's pvh type information ++ * contains members pvshim, pvshim_path, pvshim_cmdline. ++ */ ++#define LIBXL_HAVE_PV_SHIM 1 ++ + typedef char **libxl_string_list; + void libxl_string_list_dispose(libxl_string_list *sl); + int libxl_string_list_length(const libxl_string_list *sl); +diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c +index f15fb215c2..576c61ffab 100644 +--- a/tools/libxl/libxl_create.c ++++ b/tools/libxl/libxl_create.c +@@ -389,6 +389,18 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc, + } + break; + case LIBXL_DOMAIN_TYPE_PVH: ++ libxl_defbool_setdefault(&b_info->u.pvh.pvshim, false); ++ if (libxl_defbool_val(b_info->u.pvh.pvshim)) { ++ if (!b_info->u.pvh.pvshim_path) ++ b_info->u.pvh.pvshim_path = ++ libxl__sprintf(NOGC, "%s/%s", ++ libxl__xenfirmwaredir_path(), ++ PVSHIM_BASENAME); ++ if (!b_info->u.pvh.pvshim_cmdline) ++ b_info->u.pvh.pvshim_cmdline = ++ libxl__strdup(NOGC, PVSHIM_CMDLINE); ++ } ++ + break; + default: + LOG(ERROR, "invalid domain type %s in create info", +@@ -499,6 +511,9 @@ int libxl__domain_build(libxl__gc *gc, + + break; + case LIBXL_DOMAIN_TYPE_PVH: ++ state->shim_path = info->u.pvh.pvshim_path; 
++ state->shim_cmdline = info->u.pvh.pvshim_cmdline; ++ + ret = libxl__build_hvm(gc, domid, d_config, state); + if (ret) + goto out; +diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c +index fbbdb9ec2f..b03386409f 100644 +--- a/tools/libxl/libxl_dom.c ++++ b/tools/libxl/libxl_dom.c +@@ -1025,22 +1025,51 @@ static int libxl__domain_firmware(libxl__gc *gc, + + if (state->pv_kernel.path != NULL && + info->type == LIBXL_DOMAIN_TYPE_PVH) { +- /* Try to load a kernel instead of the firmware. */ +- if (state->pv_kernel.mapped) { +- rc = xc_dom_kernel_mem(dom, state->pv_kernel.data, +- state->pv_kernel.size); ++ ++ if (state->shim_path) { ++ rc = xc_dom_kernel_file(dom, state->shim_path); + if (rc) { +- LOGE(ERROR, "xc_dom_kernel_mem failed"); ++ LOGE(ERROR, "xc_dom_kernel_file failed"); + goto out; + } ++ ++ /* We've loaded the shim, so load the kernel as a secondary module */ ++ if (state->pv_kernel.mapped) { ++ LOG(WARN, "xc_dom_module_mem, cmdline %s", ++ state->pv_cmdline); ++ rc = xc_dom_module_mem(dom, state->pv_kernel.data, ++ state->pv_kernel.size, state->pv_cmdline); ++ if (rc) { ++ LOGE(ERROR, "xc_dom_kernel_mem failed"); ++ goto out; ++ } ++ } else { ++ LOG(WARN, "xc_dom_module_file, path %s cmdline %s", ++ state->pv_kernel.path, state->pv_cmdline); ++ rc = xc_dom_module_file(dom, state->pv_kernel.path, state->pv_cmdline); ++ if (rc) { ++ LOGE(ERROR, "xc_dom_kernel_file failed"); ++ goto out; ++ } ++ } + } else { +- rc = xc_dom_kernel_file(dom, state->pv_kernel.path); +- if (rc) { +- LOGE(ERROR, "xc_dom_kernel_file failed"); +- goto out; ++ /* No shim, so load the kernel directly */ ++ if (state->pv_kernel.mapped) { ++ rc = xc_dom_kernel_mem(dom, state->pv_kernel.data, ++ state->pv_kernel.size); ++ if (rc) { ++ LOGE(ERROR, "xc_dom_kernel_mem failed"); ++ goto out; ++ } ++ } else { ++ rc = xc_dom_kernel_file(dom, state->pv_kernel.path); ++ if (rc) { ++ LOGE(ERROR, "xc_dom_kernel_file failed"); ++ goto out; ++ } + } + } +- ++ + if 
(state->pv_ramdisk.path && strlen(state->pv_ramdisk.path)) { + if (state->pv_ramdisk.mapped) { + rc = xc_dom_module_mem(dom, state->pv_ramdisk.data, +@@ -1154,8 +1183,14 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid, + + xc_dom_loginit(ctx->xch); + ++ /* ++ * If PVH and we have a shim override, use the shim cmdline. ++ * If PVH and no shim override, use the pv cmdline. ++ * If not PVH, use info->cmdline. ++ */ + dom = xc_dom_allocate(ctx->xch, info->type == LIBXL_DOMAIN_TYPE_PVH ? +- state->pv_cmdline : info->cmdline, NULL); ++ (state->shim_path ? state->shim_cmdline : state->pv_cmdline) : ++ info->cmdline, NULL); + if (!dom) { + LOGE(ERROR, "xc_dom_allocate failed"); + rc = ERROR_NOMEM; +diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h +index bfa95d8619..2454efa621 100644 +--- a/tools/libxl/libxl_internal.h ++++ b/tools/libxl/libxl_internal.h +@@ -118,6 +118,8 @@ + #define TAP_DEVICE_SUFFIX "-emu" + #define DOMID_XS_PATH "domid" + #define INVALID_DOMID ~0 ++#define PVSHIM_BASENAME "xen-shim" ++#define PVSHIM_CMDLINE "pv-shim console=xen,pv sched=null loglvl=all guest_loglvl=all apic_verbosity=debug e820-verbose" + + /* Size macros. */ + #define __AC(X,Y) (X##Y) +@@ -1136,6 +1138,8 @@ typedef struct { + + libxl__file_reference pv_kernel; + libxl__file_reference pv_ramdisk; ++ const char * shim_path; ++ const char * shim_cmdline; + const char * pv_cmdline; + + xen_vmemrange_t *vmemranges; +diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl +index a239324341..6d060edc0d 100644 +--- a/tools/libxl/libxl_types.idl ++++ b/tools/libxl/libxl_types.idl +@@ -592,7 +592,10 @@ libxl_domain_build_info = Struct("domain_build_info",[ + # Use host's E820 for PCI passthrough. 
+ ("e820_host", libxl_defbool), + ])), +- ("pvh", None), ++ ("pvh", Struct(None, [("pvshim", libxl_defbool), ++ ("pvshim_path", string), ++ ("pvshim_cmdline", string), ++ ])), + ("invalid", None), + ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")), + +-- +2.14.3 + + +From ab9e3854ddb2fad2b86aaf5144a26f5569b63cfc Mon Sep 17 00:00:00 2001 +From: Ian Jackson +Date: Fri, 5 Jan 2018 15:59:29 +0000 +Subject: [PATCH 71/77] libxl: pvshim: Introduce pvshim_extra + +And move the debugging options from the default config into a doc +comment in libxl_types.idl. + +Signed-off-by: Ian Jackson +--- +v2: pvshim, not pvhshim + works with type "pvh", not type "pv" +--- + tools/libxl/libxl.h | 2 +- + tools/libxl/libxl_create.c | 5 ++++- + tools/libxl/libxl_internal.h | 2 +- + tools/libxl/libxl_types.idl | 1 + + 4 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h +index 9632fd6d2f..3c0ae6de47 100644 +--- a/tools/libxl/libxl.h ++++ b/tools/libxl/libxl.h +@@ -1105,7 +1105,7 @@ void libxl_mac_copy(libxl_ctx *ctx, libxl_mac *dst, const libxl_mac *src); + * LIBXL_HAVE_PV_SHIM + * + * If this is defined, libxl_domain_build_info's pvh type information +- * contains members pvshim, pvshim_path, pvshim_cmdline. ++ * contains members pvshim, pvshim_path, pvshim_cmdline, pvshim_extra. + */ + #define LIBXL_HAVE_PV_SHIM 1 + +diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c +index 576c61ffab..1fa1d3a621 100644 +--- a/tools/libxl/libxl_create.c ++++ b/tools/libxl/libxl_create.c +@@ -512,7 +512,10 @@ int libxl__domain_build(libxl__gc *gc, + break; + case LIBXL_DOMAIN_TYPE_PVH: + state->shim_path = info->u.pvh.pvshim_path; +- state->shim_cmdline = info->u.pvh.pvshim_cmdline; ++ state->shim_cmdline = GCSPRINTF("%s%s%s", ++ info->u.pvh.pvshim_cmdline, ++ info->u.pvh.pvshim_extra ? " " : "", ++ info->u.pvh.pvshim_extra ? 
info->u.pvh.pvshim_extra : ""); + + ret = libxl__build_hvm(gc, domid, d_config, state); + if (ret) +diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h +index 2454efa621..0f89364466 100644 +--- a/tools/libxl/libxl_internal.h ++++ b/tools/libxl/libxl_internal.h +@@ -119,7 +119,7 @@ + #define DOMID_XS_PATH "domid" + #define INVALID_DOMID ~0 + #define PVSHIM_BASENAME "xen-shim" +-#define PVSHIM_CMDLINE "pv-shim console=xen,pv sched=null loglvl=all guest_loglvl=all apic_verbosity=debug e820-verbose" ++#define PVSHIM_CMDLINE "pv-shim console=xen,pv sched=null" + + /* Size macros. */ + #define __AC(X,Y) (X##Y) +diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl +index 6d060edc0d..d74fac7d30 100644 +--- a/tools/libxl/libxl_types.idl ++++ b/tools/libxl/libxl_types.idl +@@ -595,6 +595,7 @@ libxl_domain_build_info = Struct("domain_build_info",[ + ("pvh", Struct(None, [("pvshim", libxl_defbool), + ("pvshim_path", string), + ("pvshim_cmdline", string), ++ ("pvshim_extra", string), # eg "loglvl=all guest_loglvl=all apic_verbosity=debug e820-verbose" + ])), + ("invalid", None), + ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")), +-- +2.14.3 + + +From 0e2d64ae8f4af4dbd49127107ae6237e7f748c04 Mon Sep 17 00:00:00 2001 +From: Ian Jackson +Date: Fri, 22 Dec 2017 16:12:23 +0000 +Subject: [PATCH 72/77] xl: pvshim: Provide and document xl config + +Signed-off-by: Ian Jackson +Signed-off-by: Wei Liu +--- +v2: pvshim, not pvhshim + works with type "pvh", not type "pv" + pvshim_etc. options in config are not erroneously ignored +--- + docs/man/xl.cfg.pod.5.in | 35 +++++++++++++++++++++++++++++++++++ + tools/xl/xl_parse.c | 14 ++++++++++++++ + 2 files changed, 49 insertions(+) + +diff --git a/docs/man/xl.cfg.pod.5.in b/docs/man/xl.cfg.pod.5.in +index b7b91d8627..bf6c266de1 100644 +--- a/docs/man/xl.cfg.pod.5.in ++++ b/docs/man/xl.cfg.pod.5.in +@@ -508,6 +508,41 @@ Load the specified file as firmware for the guest. 
+ Currently there's no firmware available for PVH guests, they should be + booted using the B method or the B option. + ++=over 4 ++ ++=item B ++ ++Whether to boot this guest as a PV guest within a PVH container. ++Ie, the guest will experience a PV environment, ++but ++processor hardware extensions are used to ++separate its address space ++to mitigate the Meltdown attack (CVE-2017-5754). ++ ++Default is false. ++ ++=item B ++ ++The PV shim is a specially-built firmware-like executable ++constructed from the hypervisor source tree. ++This option specifies to use a non-default shim. ++Ignored if pvshim is false. ++ ++=item B ++ ++Command line for the shim. ++Default is "pv-shim console=xen,pv sched=null". ++Ignored if pvshim is false. ++ ++=item B ++ ++Extra command line arguments for the shim. ++If supplied, appended to the value for pvshim_cmdline. ++Default is empty. ++Ignored if pvshim is false. ++ ++=back ++ + =head3 Other Options + + =over 4 +diff --git a/tools/xl/xl_parse.c b/tools/xl/xl_parse.c +index 9a692d5ae6..fdfe693de1 100644 +--- a/tools/xl/xl_parse.c ++++ b/tools/xl/xl_parse.c +@@ -964,6 +964,20 @@ void parse_config_data(const char *config_source, + xlu_cfg_replace_string(config, "pool", &c_info->pool_name, 0); + + libxl_domain_build_info_init_type(b_info, c_info->type); ++ ++ if (b_info->type == LIBXL_DOMAIN_TYPE_PVH) { ++ xlu_cfg_get_defbool(config, "pvshim", &b_info->u.pvh.pvshim, 0); ++ if (!xlu_cfg_get_string(config, "pvshim_path", &buf, 0)) ++ xlu_cfg_replace_string(config, "pvshim_path", ++ &b_info->u.pvh.pvshim_path, 0); ++ if (!xlu_cfg_get_string(config, "pvshim_cmdline", &buf, 0)) ++ xlu_cfg_replace_string(config, "pvshim_cmdline", ++ &b_info->u.pvh.pvshim_cmdline, 0); ++ if (!xlu_cfg_get_string(config, "pvshim_extra", &buf, 0)) ++ xlu_cfg_replace_string(config, "pvshim_extra", ++ &b_info->u.pvh.pvshim_extra, 0); ++ } ++ + if (blkdev_start) + b_info->blkdev_start = strdup(blkdev_start); + +-- +2.14.3 + + +From 
0a515eeb966add7c63d764cabffec3b2f560a588 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 16 Jan 2018 14:48:53 +0000 +Subject: [PATCH 73/77] xen/pvshim: map vcpu_info earlier for APs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Or else init_percpu_time is going to dereference a NULL pointer when +trying to access vcpu_info. + +Signed-off-by: Roger Pau Monné +Tested-by: George Dunlap +--- + xen/arch/x86/smpboot.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 5c7863035e..5ed82b16a8 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -346,6 +346,9 @@ void start_secondary(void *unused) + else + microcode_resume_cpu(cpu); + ++ if ( xen_guest ) ++ hypervisor_ap_setup(); ++ + smp_callin(); + + init_percpu_time(); +@@ -374,9 +377,6 @@ void start_secondary(void *unused) + cpumask_set_cpu(cpu, &cpu_online_map); + unlock_vector_lock(); + +- if ( xen_guest ) +- hypervisor_ap_setup(); +- + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + mtrr_ap_init(); +-- +2.14.3 + + +From 6f1979c8e4184f1f2b24b860e30d3b037b2e7f05 Mon Sep 17 00:00:00 2001 +From: Michael Young +Date: Mon, 15 Jan 2018 21:23:20 +0000 +Subject: [PATCH 74/77] -xen-attach is needed for pvh boot with qemu-xen +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently the boot of a pvh guest using the qemu-xen device model fails +with the error +xen emulation not implemented (yet) +in the qemu-dm log file. This patch adds the missing -xen-attach +argument. + +V2: Use b_info->type != LIBXL_DOMAIN_TYPE_HVM instead of + (b_info->type == LIBXL_DOMAIN_TYPE_PV) || + (b_info->type == LIBXL_DOMAIN_TYPE_PVH) +as recommended by Roger Pau Monné. 
+ +Signed-off-by: Michael Young +Reviewed-by: Roger Pau Monné +Acked-by: Wei Liu +--- + tools/libxl/libxl_dm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c +index a2ea95a9be..a3cddce8b7 100644 +--- a/tools/libxl/libxl_dm.c ++++ b/tools/libxl/libxl_dm.c +@@ -1021,7 +1021,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, + */ + flexarray_append(dm_args, "-no-user-config"); + +- if (b_info->type == LIBXL_DOMAIN_TYPE_PV) { ++ if (b_info->type != LIBXL_DOMAIN_TYPE_HVM) { + flexarray_append(dm_args, "-xen-attach"); + } + +-- +2.14.3 + + +From 69f4d872e524932d392acd80989c5b776baa4522 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Wed, 17 Jan 2018 10:57:02 +0000 +Subject: [PATCH 75/77] x86/guest: use the vcpu_info area from shared_info +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If using less than 32 vCPUs (XEN_LEGACY_MAX_VCPUS). + +This is a workaround that should allow to boot the shim on hypervisors +without commit "x86/upcall: inject a spurious event after setting +upcall vector" as long as less than 32 vCPUs are assigned to the +shim. 
+ +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +--- + xen/arch/x86/guest/xen.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/guest/xen.c b/xen/arch/x86/guest/xen.c +index 2a5554ab26..ed8b8c8c7b 100644 +--- a/xen/arch/x86/guest/xen.c ++++ b/xen/arch/x86/guest/xen.c +@@ -257,7 +257,8 @@ void __init hypervisor_setup(void) + map_shared_info(); + + set_vcpu_id(); +- vcpu_info = xzalloc_array(struct vcpu_info, nr_cpu_ids); ++ if ( nr_cpu_ids > XEN_LEGACY_MAX_VCPUS ) ++ vcpu_info = xzalloc_array(struct vcpu_info, nr_cpu_ids); + if ( map_vcpuinfo() ) + { + xfree(vcpu_info); +-- +2.14.3 + + +From 79f797c3f41c15a74d627a8eabc373ec7b202933 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Wed, 17 Jan 2018 09:48:14 +0000 +Subject: [PATCH 76/77] firmware/shim: fix build process to use POSIX find + options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The -printf find option is not POSIX compatible, so replace it with +another rune. + +Signed-off-by: Roger Pau Monné +Acked-by: Wei Liu +--- + tools/firmware/xen-dir/Makefile | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/tools/firmware/xen-dir/Makefile b/tools/firmware/xen-dir/Makefile +index adf6c31e8d..de754c752e 100644 +--- a/tools/firmware/xen-dir/Makefile ++++ b/tools/firmware/xen-dir/Makefile +@@ -21,7 +21,8 @@ linkfarm.stamp: $(DEP_DIRS) $(DEP_FILES) FORCE + $(foreach d, $(LINK_DIRS), \ + (mkdir -p $(D)/$(d); \ + cd $(D)/$(d); \ +- find $(XEN_ROOT)/$(d)/ -type d -printf "./%P\n" | xargs mkdir -p);) ++ find $(XEN_ROOT)/$(d)/ -type d -exec sh -c \ ++ "echo {} | sed 's,^$(XEN_ROOT)/$(d)/,,g' | xargs mkdir -p" \;);) + $(foreach d, $(LINK_DIRS), \ + (cd $(XEN_ROOT); \ + find $(d) ! 
-type l -type f \ +-- +2.14.3 + + +From fa23f2aaa24c603f748b49b32378b738d18cc68f Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Wed, 17 Jan 2018 12:00:41 +0000 +Subject: [PATCH 77/77] xen/pvh: place the trampoline at page 0x1 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Since PVH guests jump straight into trampoline_setup, trampoline_phys is +not initialized, thus the trampoline is relocated to address 0. + +This works, but has the undesirable effect of having VA 0 mapped to +MFN 0, which means NULL pointer dereferences no longer trigger a page +fault. + +In order to solve this, place the trampoline at page 0x1 and reserve +the memory used by it. + +Signed-off-by: Roger Pau Monné +Reviewed-by: Wei Liu +--- + xen/arch/x86/boot/head.S | 3 +++ + xen/arch/x86/mm.c | 9 +++++++-- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S +index 14caca6798..c527910478 100644 +--- a/xen/arch/x86/boot/head.S ++++ b/xen/arch/x86/boot/head.S +@@ -411,6 +411,9 @@ __pvh_start: + /* Skip bootloader setup and bios setup, go straight to trampoline */ + movb $1, sym_esi(pvh_boot) + movb $1, sym_esi(skip_realmode) ++ ++ /* Set trampoline_phys to use mfn 1 to avoid having a mapping at VA 0 */ ++ movw $0x1000, sym_esi(trampoline_phys) + jmp trampoline_setup + + #endif /* CONFIG_PVH_GUEST */ +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 371c764027..a8b59617d3 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -292,9 +292,14 @@ void __init arch_init_memory(void) + /* + * First 1MB of RAM is historically marked as I/O. If we booted PVH, + * reclaim the space. Irrespective, leave MFN 0 as special for the sake +- * of 0 being a very common default value. ++ * of 0 being a very common default value. Also reserve page 0x1 which is ++ * used by the trampoline code on PVH. + */ +- for ( i = 0; i < (pvh_boot ? 
1 : 0x100); i++ ) ++ BUG_ON(pvh_boot && trampoline_phys != 0x1000); ++ for ( i = 0; ++ i < (pvh_boot ? (1 + PFN_UP(trampoline_end - trampoline_start)) ++ : 0x100); ++ i++ ) + share_xen_page_with_guest(mfn_to_page(_mfn(i)), + dom_io, XENSHARE_writable); + +-- +2.14.3 + diff --git a/xen.comet.fixes.patch b/xen.comet.fixes.patch new file mode 100644 index 0000000..2cc0465 --- /dev/null +++ b/xen.comet.fixes.patch @@ -0,0 +1,150 @@ +From db3ae8becc2b4f9f544eafa06a7c858c7cc9f029 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Wed, 17 Jan 2018 09:50:27 +0000 +Subject: [PATCH] tools: fix arm build after bdf693ee61b48 + +The ramdisk fields were removed. We should use modules[0] instead. + +Signed-off-by: Wei Liu +Acked-by: Ian Jackson +--- + tools/libxc/xc_dom_arm.c | 10 +++++----- + tools/libxl/libxl_arm.c | 6 +++--- + 2 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c +index fce151d821..5b9eca6087 100644 +--- a/tools/libxc/xc_dom_arm.c ++++ b/tools/libxc/xc_dom_arm.c +@@ -390,8 +390,8 @@ static int meminit(struct xc_dom_image *dom) + const uint64_t kernsize = kernend - kernbase; + const uint64_t dtb_size = dom->devicetree_blob ? + ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0; +- const uint64_t ramdisk_size = dom->ramdisk_blob ? +- ROUNDUP(dom->ramdisk_size, XC_PAGE_SHIFT) : 0; ++ const uint64_t ramdisk_size = dom->modules[0].blob ? 
++ ROUNDUP(dom->modules[0].size, XC_PAGE_SHIFT) : 0; + const uint64_t modsize = dtb_size + ramdisk_size; + const uint64_t ram128mb = bankbase[0] + (128<<20); + +@@ -483,12 +483,12 @@ static int meminit(struct xc_dom_image *dom) + */ + if ( ramdisk_size ) + { +- dom->ramdisk_seg.vstart = modbase; +- dom->ramdisk_seg.vend = modbase + ramdisk_size; ++ dom->modules[0].seg.vstart = modbase; ++ dom->modules[0].seg.vend = modbase + ramdisk_size; + + DOMPRINTF("%s: ramdisk: 0x%" PRIx64 " -> 0x%" PRIx64 "", + __FUNCTION__, +- dom->ramdisk_seg.vstart, dom->ramdisk_seg.vend); ++ dom->modules[0].seg.vstart, dom->modules[0].seg.vend); + + modbase += ramdisk_size; + } +diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c +index de1840bece..3e46554301 100644 +--- a/tools/libxl/libxl_arm.c ++++ b/tools/libxl/libxl_arm.c +@@ -923,7 +923,7 @@ next_resize: + FDT( fdt_begin_node(fdt, "") ); + + FDT( make_root_properties(gc, vers, fdt) ); +- FDT( make_chosen_node(gc, fdt, !!dom->ramdisk_blob, state, info) ); ++ FDT( make_chosen_node(gc, fdt, !!dom->modules[0].blob, state, info) ); + FDT( make_cpus_node(gc, fdt, info->max_vcpus, ainfo) ); + FDT( make_psci_node(gc, fdt) ); + +@@ -1053,8 +1053,8 @@ int libxl__arch_domain_finalise_hw_description(libxl__gc *gc, + int i; + const uint64_t bankbase[] = GUEST_RAM_BANK_BASES; + +- const struct xc_dom_seg *ramdisk = dom->ramdisk_blob ? +- &dom->ramdisk_seg : NULL; ++ const struct xc_dom_seg *ramdisk = dom->modules[0].blob ? ++ &dom->modules[0].seg : NULL; + + if (ramdisk) { + int chosen, res; +-- +2.14.3 + +From 81838c9067ab7f4b89d33f90a71225ffff9800ba Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Wed, 17 Jan 2018 16:43:54 +0000 +Subject: [PATCH] ocaml: fix arm build + +ARM doesn't have emulation_flags in the arch_domainconfig. 
+ +Signed-off-by: Wei Liu +Reviewed-by: Julien Grall +--- + tools/ocaml/libs/xc/xenctrl_stubs.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index 0b5a2361c0..dd6000caa3 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -176,10 +176,14 @@ CAMLprim value stub_xc_domain_create(value xch, value ssidref, + break; + + case 1: /* X86 - emulation flags in the block */ ++#if defined(__i386__) || defined(__x86_64__) + for (l = Field(Field(domconfig, 0), 0); + l != Val_none; + l = Field(l, 1)) + config.emulation_flags |= 1u << Int_val(Field(l, 0)); ++#else ++ caml_failwith("Unhandled: x86"); ++#endif + break; + + default: +@@ -320,6 +324,7 @@ static value alloc_domaininfo(xc_domaininfo_t * info) + + Store_field(result, 15, tmp); + ++#if defined(__i386__) || defined(__x86_64__) + /* emulation_flags: x86_arch_emulation_flags list; */ + tmp = emul_list = Val_emptylist; + for (i = 0; i < 10; i++) { +@@ -341,6 +346,7 @@ static value alloc_domaininfo(xc_domaininfo_t * info) + Store_field(arch_config, 0, x86_arch_config); + + Store_field(result, 16, arch_config); ++#endif + + CAMLreturn(result); + } +-- +2.14.3 + +From 36c560e7f38130f12a36e8b66b0785fb655fe893 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Tue, 16 Jan 2018 18:56:45 +0000 +Subject: [PATCH] Don't build xen-shim for 32 bit build host + +Signed-off-by: Wei Liu +--- + tools/firmware/Makefile | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tools/firmware/Makefile b/tools/firmware/Makefile +index 9387cc0878..b2f011df49 100644 +--- a/tools/firmware/Makefile ++++ b/tools/firmware/Makefile +@@ -1,7 +1,9 @@ + XEN_ROOT = $(CURDIR)/../.. + include $(XEN_ROOT)/tools/Rules.mk + ++ifneq ($(XEN_TARGET_ARCH),x86_32) + CONFIG_PV_SHIM := y ++endif + + # hvmloader is a 32-bit protected mode binary. 
+ TARGET := hvmloader/hvmloader +-- +2.14.3 + diff --git a/xen.fedora.efi.build.patch b/xen.fedora.efi.build.patch index a531445..96cfb35 100644 --- a/xen.fedora.efi.build.patch +++ b/xen.fedora.efi.build.patch @@ -6,8 +6,8 @@ echo '$(TARGET).efi'; fi) +LD_EFI ?= $(LD) - ifneq ($(build_id_linker),) - notes_phdrs = --notes + shim-$(CONFIG_PVH_GUEST) := $(TARGET)-shim + @@ -173,20 +174,20 @@ $(TARGET).efi: prelink-efi.o $(note_file) efi.lds efi/relocs-dummy.o $(BASEDIR)/common/symbols-dummy.o efi/mkreloc diff --git a/xen.hypervisor.config b/xen.hypervisor.config index 58c9858..9101058 100644 --- a/xen.hypervisor.config +++ b/xen.hypervisor.config @@ -61,6 +61,7 @@ CONFIG_HAS_PCI=y CONFIG_VIDEO=y CONFIG_VGA=y CONFIG_DEFCONFIG_LIST="$ARCH_DEFCONFIG" +CONFIG_XEN_GUEST=n # # Debugging Options diff --git a/xen.spec b/xen.spec index 2c0a30b..3b1d309 100644 --- a/xen.spec +++ b/xen.spec @@ -1,4 +1,4 @@ -%{!?python_sitearch: %define python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)")} +%{!?python_sitearch: %define python_sitearch %(/usr/bin/python2 -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)")} # Build ocaml bits unless rpmbuild was run with --without ocaml # or ocamlopt is missing (the xen makefile doesn't build ocaml bits if it isn't there) %define with_ocaml %{?_without_ocaml: 0} %{?!_without_ocaml: 1} @@ -60,7 +60,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.10.0 -Release: 2%{?dist} +Release: 3%{?dist} Group: Development/Libraries License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ @@ -117,6 +117,9 @@ Patch38: qemu.trad.CVE-2017-8309.patch Patch39: qemu.trad.CVE-2017-9330.patch Patch40: xen.ocaml.safe-strings.patch Patch41: xsa253.patch +Patch42: 4.10.0-shim-comet-3.patch +Patch43: xen.comet.fixes.patch +Patch44: xen.xsa254.pti.patch BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root @@ -297,8 +300,6 @@ manage Xen virtual machines. 
%prep %setup -q %patch1 -p1 -%patch2 -p1 -%patch3 -p1 %patch4 -p1 %patch5 -p1 %patch6 -p1 @@ -327,7 +328,12 @@ manage Xen virtual machines. %patch34 -p1 %patch37 -p1 %patch40 -p1 -%patch41 -p1 +#%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch2 -p1 +%patch3 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -374,7 +380,7 @@ export EXTRA_CFLAGS_QEMU_XEN="$RPM_OPT_FLAGS" %if %build_hyp %if %build_crosshyp %define efi_flags LD_EFI=false -XEN_TARGET_ARCH=x86_64 make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="/usr/bin/x86_64-linux-gnu-gcc `echo $RPM_OPT_FLAGS | sed -e 's/-m32//g' -e 's/-march=i686//g' -e 's/-mtune=atom//g' -e 's/-specs=\/usr\/lib\/rpm\/redhat\/redhat-annobin-cc1//g'`" +XEN_TARGET_ARCH=x86_64 make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="/usr/bin/x86_64-linux-gnu-gcc `echo $RPM_OPT_FLAGS | sed -e 's/-m32//g' -e 's/-march=i686//g' -e 's/-mtune=atom//g' -e 's/-specs=\/usr\/lib\/rpm\/redhat\/redhat-annobin-cc1//g' -e 's/-fstack-clash-protection//g'`" %else %ifarch armv7hl make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="gcc `echo $RPM_OPT_FLAGS | sed -e 's/-mfloat-abi=hard//g' -e 's/-march=armv7-a//g'`" @@ -382,7 +388,7 @@ make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="gcc `echo $RPM_OPT_FLAGS %ifarch aarch64 make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="gcc $RPM_OPT_FLAGS" %else -make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="gcc `echo $RPM_OPT_FLAGSi | sed -e 's/-specs=\/usr\/lib\/rpm\/redhat\/redhat-annobin-cc1//g'`" +make %{?_smp_mflags} %{?efi_flags} prefix=/usr xen CC="gcc `echo $RPM_OPT_FLAGS | sed -e 's/-specs=\/usr\/lib\/rpm\/redhat\/redhat-annobin-cc1//g'`" %endif %endif %endif @@ -736,6 +742,9 @@ rm -rf %{buildroot} %ifarch %{ix86} x86_64 %dir /usr/lib/%{name}/boot /usr/lib/xen/boot/hvmloader +%ifnarch %{ix86} +/usr/lib/xen/boot/xen-shim +%endif %if %build_stubdom /usr/lib/xen/boot/ioemu-stubdom.gz /usr/lib/xen/boot/xenstore-stubdom.gz @@ -877,6 +886,18 @@ rm -rf 
%{buildroot} %endif %changelog +* Sun Jan 14 2018 Michael Young - 4.10.0-3 +- fix typo in annobin build fix +- add 4.10.0-shim-comet-3 shim mitigation for [XSA-254, CVE-2017-5753, + CVE-2017-5715, CVE-2017-5754] + build fixes + XSA-253 patch included in comet patches + CONFIG_XEN_GUEST line needed in xen.hypervisor.config for comet + delay and adjust xen.use.fedora.ipxe.patch and xen.fedora.efi.build.patch + package /usr/lib/xen/boot/xen-shim +- add Xen page-table isolation (XPTI) mitigation for XSA-254 +- -fstack-clash-protection isn't recognized in hypervisor build x86_64 on i686 +- __python macro is no longer set, replaced by /usr/bin/python2 + * Thu Jan 04 2018 Michael Young - 4.10.0-2 - x86: memory leak with MSR emulation [XSA-253, CVE-2018-5244] (#1531110) diff --git a/xen.use.fedora.ipxe.patch b/xen.use.fedora.ipxe.patch index 500bc20..8785393 100644 --- a/xen.use.fedora.ipxe.patch +++ b/xen.use.fedora.ipxe.patch @@ -28,6 +28,6 @@ SUBDIRS-$(CONFIG_ROMBIOS) += vgabios -SUBDIRS-$(CONFIG_ROMBIOS) += etherboot +#SUBDIRS-$(CONFIG_ROMBIOS) += etherboot + SUBDIRS-$(CONFIG_PV_SHIM) += xen-dir SUBDIRS-y += hvmloader - LD32BIT-$(CONFIG_FreeBSD) := LD32BIT_FLAG=-melf_i386_fbsd diff --git a/xen.xsa254.pti.patch b/xen.xsa254.pti.patch new file mode 100644 index 0000000..5295737 --- /dev/null +++ b/xen.xsa254.pti.patch @@ -0,0 +1,1377 @@ +From 910dd005da20f27f3415b7eccdf436874989506b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 17 Jan 2018 16:54:44 +0100 +Subject: [PATCH 1/5] x86/entry: Remove support for partial cpu_user_regs + frames + +Save all GPRs on entry to Xen. + +The entry_int82() path is via a DPL1 gate, only usable by 32bit PV guests, so +can get away with only saving the 32bit registers. All other entrypoints can +be reached from 32 or 64bit contexts. + +This is part of XSA-254. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Wei Liu +Acked-by: Jan Beulich +master commit: f9eb74789af77e985ae653193f3622263499f674 +master date: 2018-01-05 19:57:07 +0000 +--- + tools/tests/x86_emulator/x86-emulate.c | 1 - + xen/arch/x86/pv/domain.c | 1 - + xen/arch/x86/pv/emul-priv-op.c | 2 - + xen/arch/x86/x86_64/compat/entry.S | 7 ++- + xen/arch/x86/x86_64/entry.S | 12 ++-- + xen/arch/x86/x86_64/traps.c | 13 ++-- + xen/arch/x86/x86_emulate.c | 1 - + xen/arch/x86/x86_emulate/x86_emulate.c | 8 +-- + xen/common/wait.c | 1 - + xen/include/asm-x86/asm_defns.h | 105 +++------------------------------ + 10 files changed, 26 insertions(+), 125 deletions(-) + +diff --git a/tools/tests/x86_emulator/x86-emulate.c b/tools/tests/x86_emulator/x86-emulate.c +index 975ddc7e53..9056610907 100644 +--- a/tools/tests/x86_emulator/x86-emulate.c ++++ b/tools/tests/x86_emulator/x86-emulate.c +@@ -3,7 +3,6 @@ + #include + + #define cpu_has_amd_erratum(nr) 0 +-#define mark_regs_dirty(r) ((void)(r)) + #define cpu_has_mpx false + #define read_bndcfgu() 0 + #define xstate_set_init(what) +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index 2234128bb3..74e9e667d2 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -20,7 +20,6 @@ + static void noreturn continue_nonidle_domain(struct vcpu *v) + { + check_wakeup_from_wait(); +- mark_regs_dirty(guest_cpu_user_regs()); + reset_stack_and_jump(ret_from_intr); + } + +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index 2f9264548a..5f23c2cfbf 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -337,7 +337,6 @@ static int read_io(unsigned int port, unsigned int bytes, + io_emul_stub_t *io_emul = + io_emul_stub_setup(poc, ctxt->opcode, port, bytes); + +- mark_regs_dirty(ctxt->regs); + io_emul(ctxt->regs); + return X86EMUL_DONE; + } +@@ -436,7 +435,6 @@ static int write_io(unsigned int port, unsigned int bytes, + io_emul_stub_t *io_emul = + 
io_emul_stub_setup(poc, ctxt->opcode, port, bytes); + +- mark_regs_dirty(ctxt->regs); + io_emul(ctxt->regs); + if ( (bytes == 1) && pv_post_outb_hook ) + pv_post_outb_hook(port, val); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index ba6e941837..3fea54ee9d 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -16,7 +16,8 @@ + ENTRY(entry_int82) + ASM_CLAC + pushq $0 +- SAVE_VOLATILE type=HYPERCALL_VECTOR compat=1 ++ movl $HYPERCALL_VECTOR, 4(%rsp) ++ SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ + CR4_PV32_RESTORE + + GET_CURRENT(bx) +@@ -60,7 +61,6 @@ compat_test_guest_events: + /* %rbx: struct vcpu */ + compat_process_softirqs: + sti +- andl $~TRAP_regs_partial,UREGS_entry_vector(%rsp) + call do_softirq + jmp compat_test_all_events + +@@ -197,7 +197,8 @@ ENTRY(cstar_enter) + pushq $FLAT_USER_CS32 + pushq %rcx + pushq $0 +- SAVE_VOLATILE TRAP_syscall ++ movl $TRAP_syscall, 4(%rsp) ++ SAVE_ALL + GET_CURRENT(bx) + movq VCPU_domain(%rbx),%rcx + cmpb $0,DOMAIN_is_32bit_pv(%rcx) +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 6066ed8b18..1dd9ccf6a2 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -98,7 +98,8 @@ ENTRY(lstar_enter) + pushq $FLAT_KERNEL_CS64 + pushq %rcx + pushq $0 +- SAVE_VOLATILE TRAP_syscall ++ movl $TRAP_syscall, 4(%rsp) ++ SAVE_ALL + GET_CURRENT(bx) + testb $TF_kernel_mode,VCPU_thread_flags(%rbx) + jz switch_to_kernel +@@ -140,7 +141,6 @@ test_guest_events: + /* %rbx: struct vcpu */ + process_softirqs: + sti +- SAVE_PRESERVED + call do_softirq + jmp test_all_events + +@@ -190,7 +190,8 @@ GLOBAL(sysenter_eflags_saved) + pushq $3 /* ring 3 null cs */ + pushq $0 /* null rip */ + pushq $0 +- SAVE_VOLATILE TRAP_syscall ++ movl $TRAP_syscall, 4(%rsp) ++ SAVE_ALL + GET_CURRENT(bx) + cmpb $0,VCPU_sysenter_disables_events(%rbx) + movq VCPU_sysenter_addr(%rbx),%rax +@@ -207,7 +208,6 @@ 
UNLIKELY_END(sysenter_nt_set) + leal (,%rcx,TBF_INTERRUPT),%ecx + UNLIKELY_START(z, sysenter_gpf) + movq VCPU_trap_ctxt(%rbx),%rsi +- SAVE_PRESERVED + movl $TRAP_gp_fault,UREGS_entry_vector(%rsp) + movl %eax,TRAPBOUNCE_error_code(%rdx) + movq TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_eip(%rsi),%rax +@@ -225,7 +225,8 @@ UNLIKELY_END(sysenter_gpf) + ENTRY(int80_direct_trap) + ASM_CLAC + pushq $0 +- SAVE_VOLATILE 0x80 ++ movl $0x80, 4(%rsp) ++ SAVE_ALL + + cmpb $0,untrusted_msi(%rip) + UNLIKELY_START(ne, msi_check) +@@ -253,7 +254,6 @@ int80_slow_path: + * IDT entry with DPL==0. + */ + movl $((0x80 << 3) | X86_XEC_IDT),UREGS_error_code(%rsp) +- SAVE_PRESERVED + movl $TRAP_gp_fault,UREGS_entry_vector(%rsp) + /* A GPF wouldn't have incremented the instruction pointer. */ + subq $2,UREGS_rip(%rsp) +diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c +index 2a326be58e..3652f5ff21 100644 +--- a/xen/arch/x86/x86_64/traps.c ++++ b/xen/arch/x86/x86_64/traps.c +@@ -80,15 +80,10 @@ static void _show_registers( + regs->rbp, regs->rsp, regs->r8); + printk("r9: %016lx r10: %016lx r11: %016lx\n", + regs->r9, regs->r10, regs->r11); +- if ( !(regs->entry_vector & TRAP_regs_partial) ) +- { +- printk("r12: %016lx r13: %016lx r14: %016lx\n", +- regs->r12, regs->r13, regs->r14); +- printk("r15: %016lx cr0: %016lx cr4: %016lx\n", +- regs->r15, crs[0], crs[4]); +- } +- else +- printk("cr0: %016lx cr4: %016lx\n", crs[0], crs[4]); ++ printk("r12: %016lx r13: %016lx r14: %016lx\n", ++ regs->r12, regs->r13, regs->r14); ++ printk("r15: %016lx cr0: %016lx cr4: %016lx\n", ++ regs->r15, crs[0], crs[4]); + printk("cr3: %016lx cr2: %016lx\n", crs[3], crs[2]); + printk("fsb: %016lx gsb: %016lx gss: %016lx\n", + crs[5], crs[6], crs[7]); +diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c +index cc334ca8f9..c7ba221d11 100644 +--- a/xen/arch/x86/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate.c +@@ -11,7 +11,6 @@ + + #include + #include +-#include /* mark_regs_dirty() 
*/ + #include /* current_cpu_info */ + #include + #include /* cpu_has_amd_erratum() */ +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c +index 54a275664a..820495fb9c 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -1956,10 +1956,10 @@ decode_register( + case 9: p = ®s->r9; break; + case 10: p = ®s->r10; break; + case 11: p = ®s->r11; break; +- case 12: mark_regs_dirty(regs); p = ®s->r12; break; +- case 13: mark_regs_dirty(regs); p = ®s->r13; break; +- case 14: mark_regs_dirty(regs); p = ®s->r14; break; +- case 15: mark_regs_dirty(regs); p = ®s->r15; break; ++ case 12: p = ®s->r12; break; ++ case 13: p = ®s->r13; break; ++ case 14: p = ®s->r14; break; ++ case 15: p = ®s->r15; break; + #endif + default: BUG(); p = NULL; break; + } +diff --git a/xen/common/wait.c b/xen/common/wait.c +index 9490a17dc2..c5fc094e2c 100644 +--- a/xen/common/wait.c ++++ b/xen/common/wait.c +@@ -127,7 +127,6 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv) + unsigned long dummy; + u32 entry_vector = cpu_info->guest_cpu_user_regs.entry_vector; + +- cpu_info->guest_cpu_user_regs.entry_vector &= ~TRAP_regs_partial; + ASSERT(wqv->esp == 0); + + /* Save current VCPU affinity; force wakeup on *this* CPU only. */ +diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h +index 388fc93b9d..98192eb4e6 100644 +--- a/xen/include/asm-x86/asm_defns.h ++++ b/xen/include/asm-x86/asm_defns.h +@@ -17,15 +17,6 @@ + void ret_from_intr(void); + #endif + +-#ifdef CONFIG_FRAME_POINTER +-/* Indicate special exception stack frame by inverting the frame pointer. 
*/ +-#define SETUP_EXCEPTION_FRAME_POINTER(offs) \ +- leaq offs(%rsp),%rbp; \ +- notq %rbp +-#else +-#define SETUP_EXCEPTION_FRAME_POINTER(offs) +-#endif +- + #ifndef NDEBUG + #define ASSERT_INTERRUPT_STATUS(x, msg) \ + pushf; \ +@@ -42,31 +33,6 @@ void ret_from_intr(void); + #define ASSERT_INTERRUPTS_DISABLED \ + ASSERT_INTERRUPT_STATUS(z, "INTERRUPTS DISABLED") + +-/* +- * This flag is set in an exception frame when registers R12-R15 did not get +- * saved. +- */ +-#define _TRAP_regs_partial 16 +-#define TRAP_regs_partial (1 << _TRAP_regs_partial) +-/* +- * This flag gets set in an exception frame when registers R12-R15 possibly +- * get modified from their originally saved values and hence need to be +- * restored even if the normal call flow would restore register values. +- * +- * The flag being set implies _TRAP_regs_partial to be unset. Restoring +- * R12-R15 thus is +- * - required when this flag is set, +- * - safe when _TRAP_regs_partial is unset. +- */ +-#define _TRAP_regs_dirty 17 +-#define TRAP_regs_dirty (1 << _TRAP_regs_dirty) +- +-#define mark_regs_dirty(r) ({ \ +- struct cpu_user_regs *r__ = (r); \ +- ASSERT(!((r__)->entry_vector & TRAP_regs_partial)); \ +- r__->entry_vector |= TRAP_regs_dirty; \ +-}) +- + #ifdef __ASSEMBLY__ + # define _ASM_EX(p) p-. 
+ #else +@@ -236,7 +202,7 @@ static always_inline void stac(void) + #endif + + #ifdef __ASSEMBLY__ +-.macro SAVE_ALL op ++.macro SAVE_ALL op, compat=0 + .ifeqs "\op", "CLAC" + ASM_CLAC + .else +@@ -255,40 +221,6 @@ static always_inline void stac(void) + movq %rdx,UREGS_rdx(%rsp) + movq %rcx,UREGS_rcx(%rsp) + movq %rax,UREGS_rax(%rsp) +- movq %r8,UREGS_r8(%rsp) +- movq %r9,UREGS_r9(%rsp) +- movq %r10,UREGS_r10(%rsp) +- movq %r11,UREGS_r11(%rsp) +- movq %rbx,UREGS_rbx(%rsp) +- movq %rbp,UREGS_rbp(%rsp) +- SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp) +- movq %r12,UREGS_r12(%rsp) +- movq %r13,UREGS_r13(%rsp) +- movq %r14,UREGS_r14(%rsp) +- movq %r15,UREGS_r15(%rsp) +-.endm +- +-/* +- * Save all registers not preserved by C code or used in entry/exit code. Mark +- * the frame as partial. +- * +- * @type: exception type +- * @compat: R8-R15 don't need saving, and the frame nevertheless is complete +- */ +-.macro SAVE_VOLATILE type compat=0 +-.if \compat +- movl $\type,UREGS_entry_vector-UREGS_error_code(%rsp) +-.else +- movl $\type|TRAP_regs_partial,\ +- UREGS_entry_vector-UREGS_error_code(%rsp) +-.endif +- addq $-(UREGS_error_code-UREGS_r15),%rsp +- cld +- movq %rdi,UREGS_rdi(%rsp) +- movq %rsi,UREGS_rsi(%rsp) +- movq %rdx,UREGS_rdx(%rsp) +- movq %rcx,UREGS_rcx(%rsp) +- movq %rax,UREGS_rax(%rsp) + .if !\compat + movq %r8,UREGS_r8(%rsp) + movq %r9,UREGS_r9(%rsp) +@@ -297,20 +229,17 @@ static always_inline void stac(void) + .endif + movq %rbx,UREGS_rbx(%rsp) + movq %rbp,UREGS_rbp(%rsp) +- SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp) +-.endm +- +-/* +- * Complete a frame potentially only partially saved. +- */ +-.macro SAVE_PRESERVED +- btrl $_TRAP_regs_partial,UREGS_entry_vector(%rsp) +- jnc 987f ++#ifdef CONFIG_FRAME_POINTER ++/* Indicate special exception stack frame by inverting the frame pointer. 
*/ ++ leaq UREGS_rbp(%rsp), %rbp ++ notq %rbp ++#endif ++.if !\compat + movq %r12,UREGS_r12(%rsp) + movq %r13,UREGS_r13(%rsp) + movq %r14,UREGS_r14(%rsp) + movq %r15,UREGS_r15(%rsp) +-987: ++.endif + .endm + + #define LOAD_ONE_REG(reg, compat) \ +@@ -330,7 +259,6 @@ static always_inline void stac(void) + */ + .macro RESTORE_ALL adj=0 compat=0 + .if !\compat +- testl $TRAP_regs_dirty,UREGS_entry_vector(%rsp) + movq UREGS_r11(%rsp),%r11 + movq UREGS_r10(%rsp),%r10 + movq UREGS_r9(%rsp),%r9 +@@ -347,33 +275,16 @@ static always_inline void stac(void) + LOAD_ONE_REG(si, \compat) + LOAD_ONE_REG(di, \compat) + .if !\compat +- jz 987f + movq UREGS_r15(%rsp),%r15 + movq UREGS_r14(%rsp),%r14 + movq UREGS_r13(%rsp),%r13 + movq UREGS_r12(%rsp),%r12 +-#ifndef NDEBUG +- .subsection 1 +-987: testl $TRAP_regs_partial,UREGS_entry_vector(%rsp) +- jnz 987f +- cmpq UREGS_r15(%rsp),%r15 +- jne 789f +- cmpq UREGS_r14(%rsp),%r14 +- jne 789f +- cmpq UREGS_r13(%rsp),%r13 +- jne 789f +- cmpq UREGS_r12(%rsp),%r12 +- je 987f +-789: BUG /* Corruption of partial register state. 
*/ +- .subsection 0 +-#endif + .else + xor %r15, %r15 + xor %r14, %r14 + xor %r13, %r13 + xor %r12, %r12 + .endif +-987: + LOAD_ONE_REG(bp, \compat) + LOAD_ONE_REG(bx, \compat) + subq $-(UREGS_error_code-UREGS_r15+\adj), %rsp +-- +2.14.3 + + +From 57dc197cf0d36c56ba1d9d32c6a1454bb52605bb Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 17 Jan 2018 16:56:03 +0100 +Subject: [PATCH 3/5] x86/mm: Always set _PAGE_ACCESSED on L4e updates + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: bd61fe94bee0556bc2f64999a4a8315b93f90f21 +master date: 2018-01-15 13:53:16 +0000 +--- + xen/arch/x86/pv/mm.h | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h +index 7502d533c6..976209ba4c 100644 +--- a/xen/arch/x86/pv/mm.h ++++ b/xen/arch/x86/pv/mm.h +@@ -144,9 +144,21 @@ static inline l3_pgentry_t unadjust_guest_l3e(l3_pgentry_t l3e, + static inline l4_pgentry_t adjust_guest_l4e(l4_pgentry_t l4e, + const struct domain *d) + { +- if ( likely(l4e_get_flags(l4e) & _PAGE_PRESENT) && +- likely(!is_pv_32bit_domain(d)) ) +- l4e_add_flags(l4e, _PAGE_USER); ++ /* ++ * When shadowing an L4 behind the guests back (e.g. for per-pcpu ++ * purposes), we cannot efficiently sync access bit updates from hardware ++ * (on the shadow tables) back into the guest view. ++ * ++ * We therefore unconditionally set _PAGE_ACCESSED even in the guests ++ * view. This will appear to the guest as a CPU which proactively pulls ++ * all valid L4e's into its TLB, which is compatible with the x86 ABI. ++ * ++ * At the time of writing, all PV guests set the access bit anyway, so ++ * this is no actual change in their behaviour. ++ */ ++ if ( likely(l4e_get_flags(l4e) & _PAGE_PRESENT) ) ++ l4e_add_flags(l4e, (_PAGE_ACCESSED | ++ (is_pv_32bit_domain(d) ? 
0 : _PAGE_USER))); + + return l4e; + } +-- +2.14.3 + + +From 234f481337ea1a93db968d614649a6bdfdc8418a Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 17 Jan 2018 16:56:57 +0100 +Subject: [PATCH 4/5] x86: Meltdown band-aid against malicious 64-bit PV guests + +This is a very simplistic change limiting the amount of memory a running +64-bit PV guest has mapped (and hence available for attacking): Only the +mappings of stack, IDT, and TSS are being cloned from the direct map +into per-CPU page tables. Guest controlled parts of the page tables are +being copied into those per-CPU page tables upon entry into the guest. +Cross-vCPU synchronization of top level page table entry changes is +being effected by forcing other active vCPU-s of the guest into the +hypervisor. + +The change to context_switch() isn't strictly necessary, but there's no +reason to keep switching page tables once a PV guest is being scheduled +out. + +This isn't providing full isolation yet, but it should be covering all +pieces of information exposure of which would otherwise require an XSA. + +There is certainly much room for improvement, especially of performance, +here - first and foremost suppressing all the negative effects on AMD +systems. But in the interest of backportability (including to really old +hypervisors, which may not even have alternative patching) any such is +being left out here. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: 5784de3e2067ed73efc2fe42e62831e8ae7f46c4 +master date: 2018-01-16 17:49:03 +0100 +--- + xen/arch/x86/domain.c | 5 + + xen/arch/x86/mm.c | 21 ++++ + xen/arch/x86/smpboot.c | 198 +++++++++++++++++++++++++++++++++++++ + xen/arch/x86/x86_64/asm-offsets.c | 2 + + xen/arch/x86/x86_64/compat/entry.S | 11 +++ + xen/arch/x86/x86_64/entry.S | 149 +++++++++++++++++++++++++++- + xen/include/asm-x86/asm_defns.h | 30 ++++++ + xen/include/asm-x86/current.h | 12 +++ + xen/include/asm-x86/processor.h | 1 + + xen/include/asm-x86/x86_64/page.h | 5 +- + 10 files changed, 428 insertions(+), 6 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index b44c95b493..f4a3d7445b 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1507,6 +1507,9 @@ void paravirt_ctxt_switch_to(struct vcpu *v) + { + unsigned long cr4; + ++ this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] = ++ l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW); ++ + cr4 = pv_guest_cr4_to_real_cr4(v); + if ( unlikely(cr4 != read_cr4()) ) + write_cr4(cr4); +@@ -1676,6 +1679,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next) + + ASSERT(local_irq_is_enabled()); + ++ get_cpu_info()->xen_cr3 = 0; ++ + cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); + /* Allow at most one CPU at a time to be dirty. 
*/ + ASSERT(cpumask_weight(&dirty_mask) <= 1); +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a7a76a71db..6c7d12034b 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3509,6 +3509,7 @@ long do_mmu_update( + struct vcpu *curr = current, *v = curr; + struct domain *d = v->domain, *pt_owner = d, *pg_owner; + mfn_t map_mfn = INVALID_MFN; ++ bool sync_guest = false; + uint32_t xsm_needed = 0; + uint32_t xsm_checked = 0; + int rc = put_old_guest_table(curr); +@@ -3663,6 +3664,8 @@ long do_mmu_update( + case PGT_l4_page_table: + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); ++ if ( !rc ) ++ sync_guest = true; + break; + case PGT_writable_page: + perfc_incr(writable_mmu_updates); +@@ -3765,6 +3768,24 @@ long do_mmu_update( + if ( va ) + unmap_domain_page(va); + ++ if ( sync_guest ) ++ { ++ /* ++ * Force other vCPU-s of the affected guest to pick up L4 entry ++ * changes (if any). Issue a flush IPI with empty operation mask to ++ * facilitate this (including ourselves waiting for the IPI to ++ * actually have arrived). Utilize the fact that FLUSH_VA_VALID is ++ * meaningless without FLUSH_CACHE, but will allow to pass the no-op ++ * check in flush_area_mask(). ++ */ ++ unsigned int cpu = smp_processor_id(); ++ cpumask_t *mask = per_cpu(scratch_cpumask, cpu); ++ ++ cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu)); ++ if ( !cpumask_empty(mask) ) ++ flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID); ++ } ++ + perfc_add(num_page_updates, i); + + out: +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 1609b627ae..b1fbb57a81 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -327,6 +327,9 @@ void start_secondary(void *unused) + */ + spin_debug_disable(); + ++ get_cpu_info()->xen_cr3 = 0; ++ get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt)); ++ + load_system_tables(); + + /* Full exception support from here on in. 
*/ +@@ -635,6 +638,187 @@ void cpu_exit_clear(unsigned int cpu) + set_cpu_state(CPU_STATE_DEAD); + } + ++static int clone_mapping(const void *ptr, root_pgentry_t *rpt) ++{ ++ unsigned long linear = (unsigned long)ptr, pfn; ++ unsigned int flags; ++ l3_pgentry_t *pl3e = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]) + ++ l3_table_offset(linear); ++ l2_pgentry_t *pl2e; ++ l1_pgentry_t *pl1e; ++ ++ if ( linear < DIRECTMAP_VIRT_START ) ++ return 0; ++ ++ flags = l3e_get_flags(*pl3e); ++ ASSERT(flags & _PAGE_PRESENT); ++ if ( flags & _PAGE_PSE ) ++ { ++ pfn = (l3e_get_pfn(*pl3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) | ++ (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1)); ++ flags &= ~_PAGE_PSE; ++ } ++ else ++ { ++ pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(linear); ++ flags = l2e_get_flags(*pl2e); ++ ASSERT(flags & _PAGE_PRESENT); ++ if ( flags & _PAGE_PSE ) ++ { ++ pfn = (l2e_get_pfn(*pl2e) & ~((1UL << PAGETABLE_ORDER) - 1)) | ++ (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1)); ++ flags &= ~_PAGE_PSE; ++ } ++ else ++ { ++ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(linear); ++ flags = l1e_get_flags(*pl1e); ++ if ( !(flags & _PAGE_PRESENT) ) ++ return 0; ++ pfn = l1e_get_pfn(*pl1e); ++ } ++ } ++ ++ if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) ) ++ { ++ pl3e = alloc_xen_pagetable(); ++ if ( !pl3e ) ++ return -ENOMEM; ++ clear_page(pl3e); ++ l4e_write(&rpt[root_table_offset(linear)], ++ l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR)); ++ } ++ else ++ pl3e = l4e_to_l3e(rpt[root_table_offset(linear)]); ++ ++ pl3e += l3_table_offset(linear); ++ ++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) ++ { ++ pl2e = alloc_xen_pagetable(); ++ if ( !pl2e ) ++ return -ENOMEM; ++ clear_page(pl2e); ++ l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR)); ++ } ++ else ++ { ++ ASSERT(!(l3e_get_flags(*pl3e) & _PAGE_PSE)); ++ pl2e = l3e_to_l2e(*pl3e); ++ } ++ ++ pl2e += l2_table_offset(linear); ++ ++ if ( !(l2e_get_flags(*pl2e) & 
_PAGE_PRESENT) ) ++ { ++ pl1e = alloc_xen_pagetable(); ++ if ( !pl1e ) ++ return -ENOMEM; ++ clear_page(pl1e); ++ l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR)); ++ } ++ else ++ { ++ ASSERT(!(l2e_get_flags(*pl2e) & _PAGE_PSE)); ++ pl1e = l2e_to_l1e(*pl2e); ++ } ++ ++ pl1e += l1_table_offset(linear); ++ ++ if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT ) ++ { ++ ASSERT(l1e_get_pfn(*pl1e) == pfn); ++ ASSERT(l1e_get_flags(*pl1e) == flags); ++ } ++ else ++ l1e_write(pl1e, l1e_from_pfn(pfn, flags)); ++ ++ return 0; ++} ++ ++DEFINE_PER_CPU(root_pgentry_t *, root_pgt); ++ ++static int setup_cpu_root_pgt(unsigned int cpu) ++{ ++ root_pgentry_t *rpt = alloc_xen_pagetable(); ++ unsigned int off; ++ int rc; ++ ++ if ( !rpt ) ++ return -ENOMEM; ++ ++ clear_page(rpt); ++ per_cpu(root_pgt, cpu) = rpt; ++ ++ rpt[root_table_offset(RO_MPT_VIRT_START)] = ++ idle_pg_table[root_table_offset(RO_MPT_VIRT_START)]; ++ /* SH_LINEAR_PT inserted together with guest mappings. */ ++ /* PERDOMAIN inserted during context switch. */ ++ rpt[root_table_offset(XEN_VIRT_START)] = ++ idle_pg_table[root_table_offset(XEN_VIRT_START)]; ++ ++ /* Install direct map page table entries for stack, IDT, and TSS. 
*/ ++ for ( off = rc = 0; !rc && off < STACK_SIZE; off += PAGE_SIZE ) ++ rc = clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt); ++ ++ if ( !rc ) ++ rc = clone_mapping(idt_tables[cpu], rpt); ++ if ( !rc ) ++ rc = clone_mapping(&per_cpu(init_tss, cpu), rpt); ++ ++ return rc; ++} ++ ++static void cleanup_cpu_root_pgt(unsigned int cpu) ++{ ++ root_pgentry_t *rpt = per_cpu(root_pgt, cpu); ++ unsigned int r; ++ ++ if ( !rpt ) ++ return; ++ ++ per_cpu(root_pgt, cpu) = NULL; ++ ++ for ( r = root_table_offset(DIRECTMAP_VIRT_START); ++ r < root_table_offset(HYPERVISOR_VIRT_END); ++r ) ++ { ++ l3_pgentry_t *l3t; ++ unsigned int i3; ++ ++ if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) ) ++ continue; ++ ++ l3t = l4e_to_l3e(rpt[r]); ++ ++ for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 ) ++ { ++ l2_pgentry_t *l2t; ++ unsigned int i2; ++ ++ if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) ) ++ continue; ++ ++ ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE)); ++ l2t = l3e_to_l2e(l3t[i3]); ++ ++ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 ) ++ { ++ if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) ) ++ continue; ++ ++ ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE)); ++ free_xen_pagetable(l2e_to_l1e(l2t[i2])); ++ } ++ ++ free_xen_pagetable(l2t); ++ } ++ ++ free_xen_pagetable(l3t); ++ } ++ ++ free_xen_pagetable(rpt); ++} ++ + static void cpu_smpboot_free(unsigned int cpu) + { + unsigned int order, socket = cpu_to_socket(cpu); +@@ -673,6 +857,8 @@ static void cpu_smpboot_free(unsigned int cpu) + free_domheap_page(mfn_to_page(mfn)); + } + ++ cleanup_cpu_root_pgt(cpu); ++ + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); + free_xenheap_pages(per_cpu(gdt_table, cpu), order); + +@@ -728,6 +914,9 @@ static int cpu_smpboot_alloc(unsigned int cpu) + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); + ++ if ( setup_cpu_root_pgt(cpu) ) ++ goto oom; ++ + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | 
(STUBS_PER_PAGE - 1)); ++i ) + if ( cpu_online(i) && cpu_to_node(i) == node ) +@@ -783,6 +972,8 @@ static struct notifier_block cpu_smpboot_nfb = { + + void __init smp_prepare_cpus(unsigned int max_cpus) + { ++ int rc; ++ + register_cpu_notifier(&cpu_smpboot_nfb); + + mtrr_aps_sync_begin(); +@@ -796,6 +987,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + + stack_base[0] = stack_start; + ++ rc = setup_cpu_root_pgt(0); ++ if ( rc ) ++ panic("Error %d setting up PV root page table\n", rc); ++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); ++ + set_nr_sockets(); + + socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets); +@@ -864,6 +1060,8 @@ void __init smp_prepare_boot_cpu(void) + #if NR_CPUS > 2 * BITS_PER_LONG + per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask; + #endif ++ ++ get_cpu_info()->xen_cr3 = 0; + } + + static void +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index e136af6b99..b1a4310974 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -137,6 +137,8 @@ void __dummy__(void) + OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_cr4, struct cpu_info, cr4); ++ OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3); ++ OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3); + DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); + BLANK(); + +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 3fea54ee9d..e668f00c36 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -199,6 +199,17 @@ ENTRY(cstar_enter) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lcstar_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) 
++.Lcstar_cr3_okay: ++ + GET_CURRENT(bx) + movq VCPU_domain(%rbx),%rcx + cmpb $0,DOMAIN_is_32bit_pv(%rcx) +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 1dd9ccf6a2..fc38874b1f 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -37,6 +37,32 @@ ENTRY(switch_to_kernel) + /* %rbx: struct vcpu, interrupts disabled */ + restore_all_guest: + ASSERT_INTERRUPTS_DISABLED ++ ++ /* Copy guest mappings and switch to per-CPU root page table. */ ++ mov %cr3, %r9 ++ GET_STACK_END(dx) ++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi ++ movabs $PADDR_MASK & PAGE_MASK, %rsi ++ movabs $DIRECTMAP_VIRT_START, %rcx ++ mov %rdi, %rax ++ and %rsi, %rdi ++ and %r9, %rsi ++ add %rcx, %rdi ++ add %rcx, %rsi ++ mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx ++ mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8 ++ mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi) ++ rep movsq ++ mov $ROOT_PAGETABLE_ENTRIES - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1, %ecx ++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rsi ++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi ++ rep movsq ++ mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx) ++ write_cr3 rax, rdi, rsi ++ + RESTORE_ALL + testw $TRAP_syscall,4(%rsp) + jz iret_exit_to_guest +@@ -71,6 +97,22 @@ iret_exit_to_guest: + ALIGN + /* No special register assumptions. */ + restore_all_xen: ++ /* ++ * Check whether we need to switch to the per-CPU page tables, in ++ * case we return to late PV exit code (from an NMI or #MC). ++ */ ++ GET_STACK_END(ax) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rax), %rdx ++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rax), %rax ++ test %rdx, %rdx ++ /* ++ * Ideally the condition would be "nsz", but such doesn't exist, ++ * so "g" will have to do. 
++ */ ++UNLIKELY_START(g, exit_cr3) ++ write_cr3 rax, rdi, rsi ++UNLIKELY_END(exit_cr3) ++ + RESTORE_ALL adj=8 + iretq + +@@ -100,7 +142,18 @@ ENTRY(lstar_enter) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL +- GET_CURRENT(bx) ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Llstar_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Llstar_cr3_okay: ++ ++ __GET_CURRENT(bx) + testb $TF_kernel_mode,VCPU_thread_flags(%rbx) + jz switch_to_kernel + +@@ -192,7 +245,18 @@ GLOBAL(sysenter_eflags_saved) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL +- GET_CURRENT(bx) ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lsyse_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Lsyse_cr3_okay: ++ ++ __GET_CURRENT(bx) + cmpb $0,VCPU_sysenter_disables_events(%rbx) + movq VCPU_sysenter_addr(%rbx),%rax + setne %cl +@@ -228,13 +292,23 @@ ENTRY(int80_direct_trap) + movl $0x80, 4(%rsp) + SAVE_ALL + ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lint80_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Lint80_cr3_okay: ++ + cmpb $0,untrusted_msi(%rip) + UNLIKELY_START(ne, msi_check) + movl $0x80,%edi + call check_for_unexpected_msi + UNLIKELY_END(msi_check) + +- GET_CURRENT(bx) ++ __GET_CURRENT(bx) + + /* Check that the callback is non-null. 
*/ + leaq VCPU_int80_bounce(%rbx),%rdx +@@ -391,9 +465,27 @@ ENTRY(dom_crash_sync_extable) + + ENTRY(common_interrupt) + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .Lintr_cr3_okay ++ jns .Lintr_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.Lintr_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ xor %ecx, %ecx ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ testb $3, UREGS_cs(%rsp) ++ cmovnz %rcx, %r15 ++.Lintr_cr3_okay: ++ + CR4_PV32_RESTORE + movq %rsp,%rdi + callq do_IRQ ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + jmp ret_from_intr + + /* No special register assumptions. */ +@@ -411,6 +503,23 @@ ENTRY(page_fault) + /* No special register assumptions. */ + GLOBAL(handle_exception) + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .Lxcpt_cr3_okay ++ jns .Lxcpt_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.Lxcpt_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ xor %ecx, %ecx ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ testb $3, UREGS_cs(%rsp) ++ cmovnz %rcx, %r15 ++.Lxcpt_cr3_okay: ++ + handle_exception_saved: + GET_CURRENT(bx) + testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) +@@ -475,6 +584,7 @@ handle_exception_saved: + leaq exception_table(%rip),%rdx + PERFC_INCR(exceptions, %rax, %rbx) + callq *(%rdx,%rax,8) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3,UREGS_cs(%rsp) + jz restore_all_xen + leaq VCPU_trap_bounce(%rbx),%rdx +@@ -507,6 +617,7 @@ exception_with_ints_disabled: + rep; movsq # make room for ec/ev + 1: movq UREGS_error_code(%rsp),%rax # ec/ev + movq %rax,UREGS_kernel_sizeof(%rsp) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + jmp restore_all_xen # return to fixup code + + /* No special register assumptions. 
*/ +@@ -585,6 +696,17 @@ ENTRY(double_fault) + movl $TRAP_double_fault,4(%rsp) + /* Set AC to reduce chance of further SMAP faults */ + SAVE_ALL STAC ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rbx ++ test %rbx, %rbx ++ jz .Ldblf_cr3_okay ++ jns .Ldblf_cr3_load ++ neg %rbx ++.Ldblf_cr3_load: ++ write_cr3 rbx, rdi, rsi ++.Ldblf_cr3_okay: ++ + movq %rsp,%rdi + call do_double_fault + BUG /* do_double_fault() shouldn't return. */ +@@ -603,10 +725,28 @@ ENTRY(nmi) + movl $TRAP_nmi,4(%rsp) + handle_ist_exception: + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .List_cr3_okay ++ jns .List_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.List_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++.List_cr3_okay: ++ + CR4_PV32_RESTORE + testb $3,UREGS_cs(%rsp) + jz 1f +- /* Interrupted guest context. Copy the context to stack bottom. */ ++ /* ++ * Interrupted guest context. Clear the restore value for xen_cr3 ++ * and copy the context to stack bottom. 
++ */ ++ xor %r15, %r15 + GET_CPUINFO_FIELD(guest_cpu_user_regs,di) + movq %rsp,%rsi + movl $UREGS_kernel_sizeof/8,%ecx +@@ -616,6 +756,7 @@ handle_ist_exception: + movzbl UREGS_entry_vector(%rsp),%eax + leaq exception_table(%rip),%rdx + callq *(%rdx,%rax,8) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) + jne ret_from_intr + +diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h +index 98192eb4e6..fb0fee9286 100644 +--- a/xen/include/asm-x86/asm_defns.h ++++ b/xen/include/asm-x86/asm_defns.h +@@ -93,9 +93,30 @@ void ret_from_intr(void); + UNLIKELY_DONE(mp, tag); \ + __UNLIKELY_END(tag) + ++ .equ .Lrax, 0 ++ .equ .Lrcx, 1 ++ .equ .Lrdx, 2 ++ .equ .Lrbx, 3 ++ .equ .Lrsp, 4 ++ .equ .Lrbp, 5 ++ .equ .Lrsi, 6 ++ .equ .Lrdi, 7 ++ .equ .Lr8, 8 ++ .equ .Lr9, 9 ++ .equ .Lr10, 10 ++ .equ .Lr11, 11 ++ .equ .Lr12, 12 ++ .equ .Lr13, 13 ++ .equ .Lr14, 14 ++ .equ .Lr15, 15 ++ + #define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field) + #define GET_STACK_END(reg) \ ++ .if .Lr##reg > 8; \ ++ movq $STACK_SIZE-1, %r##reg; \ ++ .else; \ + movl $STACK_SIZE-1, %e##reg; \ ++ .endif; \ + orq %rsp, %r##reg + + #define GET_CPUINFO_FIELD(field, reg) \ +@@ -177,6 +198,15 @@ void ret_from_intr(void); + #define ASM_STAC ASM_AC(STAC) + #define ASM_CLAC ASM_AC(CLAC) + ++.macro write_cr3 val:req, tmp1:req, tmp2:req ++ mov %cr4, %\tmp1 ++ mov %\tmp1, %\tmp2 ++ and $~X86_CR4_PGE, %\tmp1 ++ mov %\tmp1, %cr4 ++ mov %\val, %cr3 ++ mov %\tmp2, %cr4 ++.endm ++ + #define CR4_PV32_RESTORE \ + 667: ASM_NOP5; \ + .pushsection .altinstr_replacement, "ax"; \ +diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h +index 89849929eb..b929c48c85 100644 +--- a/xen/include/asm-x86/current.h ++++ b/xen/include/asm-x86/current.h +@@ -41,6 +41,18 @@ struct cpu_info { + struct vcpu *current_vcpu; + unsigned long per_cpu_offset; + unsigned long cr4; ++ /* ++ * Of the two following fields the latter is being set to 
the CR3 value ++ * to be used on the given pCPU for loading whenever 64-bit PV guest ++ * context is being entered. The value never changes once set. ++ * The former is the value to restore when re-entering Xen, if any. IOW ++ * its value being zero means there's nothing to restore. However, its ++ * value can also be negative, indicating to the exit-to-Xen code that ++ * restoring is not necessary, but allowing any nested entry code paths ++ * to still know the value to put back into CR3. ++ */ ++ unsigned long xen_cr3; ++ unsigned long pv_cr3; + /* get_stack_bottom() must be 16-byte aligned */ + }; + +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 41a8d8c32f..2962e83464 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -462,6 +462,7 @@ extern idt_entry_t idt_table[]; + extern idt_entry_t *idt_tables[]; + + DECLARE_PER_CPU(struct tss_struct, init_tss); ++DECLARE_PER_CPU(root_pgentry_t *, root_pgt); + + extern void init_int80_direct_trap(struct vcpu *v); + +diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h +index 6fb7cd5553..05a0334893 100644 +--- a/xen/include/asm-x86/x86_64/page.h ++++ b/xen/include/asm-x86/x86_64/page.h +@@ -24,8 +24,8 @@ + /* These are architectural limits. Current CPUs support only 40-bit phys. 
*/ + #define PADDR_BITS 52 + #define VADDR_BITS 48 +-#define PADDR_MASK ((1UL << PADDR_BITS)-1) +-#define VADDR_MASK ((1UL << VADDR_BITS)-1) ++#define PADDR_MASK ((_AC(1,UL) << PADDR_BITS) - 1) ++#define VADDR_MASK ((_AC(1,UL) << VADDR_BITS) - 1) + + #define VADDR_TOP_BIT (1UL << (VADDR_BITS - 1)) + #define CANONICAL_MASK (~0UL & ~VADDR_MASK) +@@ -107,6 +107,7 @@ typedef l4_pgentry_t root_pgentry_t; + : (((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT) || \ + ((_s) > ROOT_PAGETABLE_LAST_XEN_SLOT))) + ++#define root_table_offset l4_table_offset + #define root_get_pfn l4e_get_pfn + #define root_get_flags l4e_get_flags + #define root_get_intpte l4e_get_intpte +-- +2.14.3 + + +From 7cccd6f748ec724cf9408cec6b3ec8e54a8a2c1f Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 17 Jan 2018 16:57:33 +0100 +Subject: [PATCH 5/5] x86: allow Meltdown band-aid to be disabled + +First of all we don't need it on AMD systems. Additionally allow its use +to be controlled by command line option. For best backportability, this +intentionally doesn't use alternative instruction patching to achieve +the intended effect - while we likely want it, this will be later +follow-up. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: e871e80c38547d9faefc6604532ba3e985e65873 +master date: 2018-01-16 17:50:59 +0100 +--- + docs/misc/xen-command-line.markdown | 12 ++++++++++++ + xen/arch/x86/domain.c | 7 +++++-- + xen/arch/x86/mm.c | 2 +- + xen/arch/x86/smpboot.c | 17 ++++++++++++++--- + xen/arch/x86/x86_64/entry.S | 2 ++ + 5 files changed, 34 insertions(+), 6 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 781110d4b2..49539b4d1c 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1849,6 +1849,18 @@ In the case that x2apic is in use, this option switches between physical and + clustered mode. The default, given no hint from the **FADT**, is cluster + mode. 
+ ++### xpti ++> `= ` ++ ++> Default: `false` on AMD hardware ++> Default: `true` everywhere else ++ ++Override default selection of whether to isolate 64-bit PV guest page ++tables. ++ ++** WARNING: Not yet a complete isolation implementation, but better than ++nothing. ** ++ + ### xsave + > `= ` + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index f4a3d7445b..b357b60f73 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1505,10 +1505,13 @@ void paravirt_ctxt_switch_from(struct vcpu *v) + + void paravirt_ctxt_switch_to(struct vcpu *v) + { ++ root_pgentry_t *root_pgt = this_cpu(root_pgt); + unsigned long cr4; + +- this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] = +- l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW); ++ if ( root_pgt ) ++ root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] = ++ l4e_from_page(v->domain->arch.perdomain_l3_pg, ++ __PAGE_HYPERVISOR_RW); + + cr4 = pv_guest_cr4_to_real_cr4(v); + if ( unlikely(cr4 != read_cr4()) ) +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 6c7d12034b..53295f85b7 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3665,7 +3665,7 @@ long do_mmu_update( + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); + if ( !rc ) +- sync_guest = true; ++ sync_guest = this_cpu(root_pgt); + break; + case PGT_writable_page: + perfc_incr(writable_mmu_updates); +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index b1fbb57a81..edf607f5a2 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -328,7 +328,7 @@ void start_secondary(void *unused) + spin_debug_disable(); + + get_cpu_info()->xen_cr3 = 0; +- get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt)); ++ get_cpu_info()->pv_cr3 = this_cpu(root_pgt) ? 
__pa(this_cpu(root_pgt)) : 0; + + load_system_tables(); + +@@ -736,14 +736,20 @@ static int clone_mapping(const void *ptr, root_pgentry_t *rpt) + return 0; + } + ++static __read_mostly int8_t opt_xpti = -1; ++boolean_param("xpti", opt_xpti); + DEFINE_PER_CPU(root_pgentry_t *, root_pgt); + + static int setup_cpu_root_pgt(unsigned int cpu) + { +- root_pgentry_t *rpt = alloc_xen_pagetable(); ++ root_pgentry_t *rpt; + unsigned int off; + int rc; + ++ if ( !opt_xpti ) ++ return 0; ++ ++ rpt = alloc_xen_pagetable(); + if ( !rpt ) + return -ENOMEM; + +@@ -987,10 +993,14 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + + stack_base[0] = stack_start; + ++ if ( opt_xpti < 0 ) ++ opt_xpti = boot_cpu_data.x86_vendor != X86_VENDOR_AMD; ++ + rc = setup_cpu_root_pgt(0); + if ( rc ) + panic("Error %d setting up PV root page table\n", rc); +- get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); ++ if ( per_cpu(root_pgt, 0) ) ++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); + + set_nr_sockets(); + +@@ -1062,6 +1072,7 @@ void __init smp_prepare_boot_cpu(void) + #endif + + get_cpu_info()->xen_cr3 = 0; ++ get_cpu_info()->pv_cr3 = 0; + } + + static void +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index fc38874b1f..a8825c89df 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -46,6 +46,7 @@ restore_all_guest: + movabs $DIRECTMAP_VIRT_START, %rcx + mov %rdi, %rax + and %rsi, %rdi ++ jz .Lrag_keep_cr3 + and %r9, %rsi + add %rcx, %rdi + add %rcx, %rsi +@@ -62,6 +63,7 @@ restore_all_guest: + rep movsq + mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx) + write_cr3 rax, rdi, rsi ++.Lrag_keep_cr3: + + RESTORE_ALL + testw $TRAP_syscall,4(%rsp) +-- +2.14.3 +