From c6bfd9a2dd2fce7d872a401442ce83797acc2ff9 Mon Sep 17 00:00:00 2001 From: Michael Young Date: Aug 14 2018 20:00:08 +0000 Subject: L1 Terminal Fault speculative side channel patch bundle [XSA-273, CVE-2018-3620, CVE-2018-3646] Use of v2 grant tables may cause crash on ARM [XSA-268] x86: Incorrect MSR_DEBUGCTL handling lets guests enable BTS [XSA-269] oxenstored does not apply quota-maxentity [XSA-272] no longer need to set python_sitearch --- diff --git a/xen.spec b/xen.spec index 90b1be8..470dd21 100644 --- a/xen.spec +++ b/xen.spec @@ -1,4 +1,3 @@ -%define python_sitearch %(/usr/bin/python2 -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)") # Build ocaml bits unless rpmbuild was run with --without ocaml # or ocamlopt is missing (the xen makefile doesn't build ocaml bits if it isn't there) %define with_ocaml %{?_without_ocaml: 0} %{?!_without_ocaml: 1} @@ -61,7 +60,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.11.0 -Release: 1%{?dist} +Release: 2%{?dist} Group: Development/Libraries License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ @@ -118,6 +117,7 @@ Patch39: qemu.trad.CVE-2017-9330.patch Patch40: xen.gcc8.temp.fix.patch Patch41: xen.drop.brctl.patch Patch42: xen.stubdom.build.patch +Patch43: xsa273.patch %if %build_qemutrad @@ -327,6 +327,7 @@ manage Xen virtual machines. %patch40 -p1 %patch41 -p1 %patch42 -p1 +%patch43 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -859,6 +860,15 @@ fi %endif %changelog +* Tue Aug 14 2018 Michael Young - 4.11.0-2 +- no longer need to set python_sitearch +- L1 Terminal Fault speculative side channel patch bundle [XSA-273, + CVE-2018-3620, CVE-2018-3646] + also includes + Use of v2 grant tables may cause crash on ARM [XSA-268] + x86: Incorrect MSR_DEBUGCTL handling lets guests enable BTS [XSA-269] + oxenstored does not apply quota-maxentity [XSA-272] + * Thu Jul 12 2018 Michael Young - 4.11.0-1 - update to 4.11.0 (#1592976) remove patches for issues now fixed upstream diff --git a/xsa273.patch b/xsa273.patch new file mode 100644 index 0000000..7047f71 --- /dev/null +++ b/xsa273.patch @@ -0,0 +1,6346 @@ +From e932371d6ae0f69b89abb2dce725483c75356de2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:17:27 +0200 +Subject: [PATCH 02/42] xen: Port the array_index_nospec() infrastructure from + Linux + +This is as the infrastructure appeared in Linux 4.17, adapted slightly for +Xen. 
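For illustration only (this text is not part of the patch): the usage pattern this infrastructure enables is the one documented in the new xen/include/xen/nospec.h further below. Perform the ordinary bounds check, then clamp the index so that a speculatively executed out-of-bounds access is also confined to [0, size). A minimal C sketch; the table, its size and the function name are hypothetical:

    #include <xen/lib.h>     /* ARRAY_SIZE() */
    #include <xen/nospec.h>

    static uint8_t table[16];

    uint8_t read_entry(unsigned int idx)
    {
        if ( idx < ARRAY_SIZE(table) )
        {
            /* Clamp idx to [0, ARRAY_SIZE(table)) even under speculation. */
            idx = array_index_nospec(idx, ARRAY_SIZE(table));
            return table[idx];
        }

        return 0;
    }
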
+ +Signed-off-by: Andrew Cooper +Signed-off-by: Julien Grall +Acked-by: Jan Beulich +master commit: 2ddfae51d8b1d7b8cd33a4f6ad4d16d27cb869ae +master date: 2018-07-06 16:49:57 +0100 +--- + xen/include/asm-arm/arm32/system.h | 18 ++++++++ + xen/include/asm-arm/arm64/system.h | 22 ++++++++++ + xen/include/asm-x86/system.h | 24 ++++++++++ + xen/include/xen/compiler.h | 3 ++ + xen/include/xen/nospec.h | 70 ++++++++++++++++++++++++++++++ + 5 files changed, 137 insertions(+) + create mode 100644 xen/include/xen/nospec.h + +diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h +index c617b40438..ab57abfbc5 100644 +--- a/xen/include/asm-arm/arm32/system.h ++++ b/xen/include/asm-arm/arm32/system.h +@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define CSDB ".inst 0xe320f014" ++ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile( "cmp %1, %2\n" ++ "sbc %0, %1, %1\n" ++ CSDB ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h +index 2e2ee212a1..2e36573ac6 100644 +--- a/xen/include/asm-arm/arm64/system.h ++++ b/xen/include/asm-arm/arm64/system.h +@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define csdb() asm volatile ( "hint #20" : : : "memory" ) ++ ++/* ++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz ++ * and 0 otherwise. ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %1, %2\n" ++ "sbc %0, xzr, xzr\n" ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ csdb(); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h +index 43fb6fe489..483cd20afd 100644 +--- a/xen/include/asm-x86/system.h ++++ b/xen/include/asm-x86/system.h +@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd( + #define set_mb(var, value) do { xchg(&var, value); } while (0) + #define set_wmb(var, value) do { var = value; smp_wmb(); } while (0) + ++/** ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the ++ * bounds check succeeds and 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * Returns: ++ * 0 - (index < size) ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];" ++ : [mask] "=r" (mask) ++ : [size] "g" (size), [index] "r" (index) ); ++ ++ return mask; ++} ++ ++/* Override default implementation in nospec.h. 
*/ ++#define array_index_mask_nospec array_index_mask_nospec ++ + #define local_irq_disable() asm volatile ( "cli" : : : "memory" ) + #define local_irq_enable() asm volatile ( "sti" : : : "memory" ) + +diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h +index 533a8ea0f3..a7e05681c9 100644 +--- a/xen/include/xen/compiler.h ++++ b/xen/include/xen/compiler.h +@@ -81,6 +81,9 @@ + #pragma GCC visibility push(hidden) + #endif + ++/* Make the optimizer believe the variable can be manipulated arbitrarily. */ ++#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) ) ++ + /* This macro obfuscates arithmetic on a variable address so that gcc + shouldn't recognize the original var, and make assumptions about it */ + /* +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +new file mode 100644 +index 0000000000..48793996e8 +--- /dev/null ++++ b/xen/include/xen/nospec.h +@@ -0,0 +1,70 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */ ++/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */ ++/* Copyright(c) 2018 Intel Corporation. All rights reserved. */ ++/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */ ++ ++#ifndef XEN_NOSPEC_H ++#define XEN_NOSPEC_H ++ ++#include ++ ++/** ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * When @index is out of bounds (@index >= @size), the sign bit will be ++ * set. Extend the sign bit to all bits and invert, giving a result of ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size). ++ */ ++#ifndef array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ /* ++ * Always calculate and emit the mask even if the compiler ++ * thinks the mask is not needed. The compiler does not take ++ * into account the value of @index under speculation. ++ */ ++ OPTIMIZER_HIDE_VAR(index); ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); ++} ++#endif ++ ++/* ++ * array_index_nospec - sanitize an array index after a bounds check ++ * ++ * For a code sequence like: ++ * ++ * if (index < size) { ++ * index = array_index_nospec(index, size); ++ * val = array[index]; ++ * } ++ * ++ * ...if the CPU speculates past the bounds check then ++ * array_index_nospec() will clamp the index within the range of [0, ++ * size). ++ */ ++#define array_index_nospec(index, size) \ ++({ \ ++ typeof(index) _i = (index); \ ++ typeof(size) _s = (size); \ ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \ ++ \ ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ ++ \ ++ (typeof(_i)) (_i & _mask); \ ++}) ++ ++#endif /* XEN_NOSPEC_H */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.17.1 + + +From da33530ab393dcc04d3e35424956277669b8d8ce Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:18:54 +0200 +Subject: [PATCH 03/42] x86: correctly set nonlazy_xstate_used when loading + full state + +In this case, just like xcr0_accum, nonlazy_xstate_used should always be +set to the intended new value, rather than possibly leaving the flag set +from a prior state load. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Wei Liu +Acked-by: Andrew Cooper +master commit: f46bf0e101ca63118b9db2616e8f51e972d7f563 +master date: 2018-07-09 10:51:02 +0200 +--- + xen/arch/x86/domctl.c | 3 +-- + xen/arch/x86/hvm/hvm.c | 3 +-- + 2 files changed, 2 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index 8fbbf3aeb3..b04388d663 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -1187,8 +1187,7 @@ long arch_do_domctl( + vcpu_pause(v); + v->arch.xcr0 = _xcr0; + v->arch.xcr0_accum = _xcr0_accum; +- if ( _xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, _xsave_area, + evc->size - PV_XSAVE_HDR_SIZE); + vcpu_unpause(v); +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index c23983cdff..279cb88e45 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1324,8 +1324,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + + v->arch.xcr0 = ctxt->xcr0; + v->arch.xcr0_accum = ctxt->xcr0_accum; +- if ( ctxt->xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, &ctxt->save_area, + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); + +-- +2.17.1 + + +From 4bdeedbd611c59f07878eb22955f655a81452835 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:19:41 +0200 +Subject: [PATCH 04/42] x86/spec-ctrl: command line handling adjustments + +For one, "no-xen" should not imply "no-eager-fpu", as "eager FPU" mode +is to guard guests, not Xen itself, which is also expressed so by +print_details(). + +And then opt_ssbd, despite being off by default, should also be cleared +by the "no" and "no-xen" sub-options. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: ac3f9a72141a48d40fabfff561d5a7dc0e1b810d +master date: 2018-07-10 12:22:31 +0200 +--- + xen/arch/x86/spec_ctrl.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 08e6784c4c..73dc7170c7 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -124,6 +124,8 @@ static int __init parse_spec_ctrl(const char *s) + opt_msr_sc_pv = false; + opt_msr_sc_hvm = false; + ++ opt_eager_fpu = 0; ++ + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +@@ -131,7 +133,7 @@ static int __init parse_spec_ctrl(const char *s) + opt_thunk = THUNK_JMP; + opt_ibrs = 0; + opt_ibpb = false; +- opt_eager_fpu = 0; ++ opt_ssbd = false; + } + else if ( val > 0 ) + rc = -EINVAL; +-- +2.17.1 + + +From 10c548215b052a266c53abd9d37d08b06ed91bb3 Mon Sep 17 00:00:00 2001 +From: Ian Jackson +Date: Mon, 30 Jul 2018 11:20:44 +0200 +Subject: [PATCH 05/42] xen: oprofile/nmi_int.c: Drop unwanted sexual reference + +This is not really very nice. + +This line doesn't have much value in itself. The rest of this comment +block is pretty clear what it wants to convey. So delete it. + +(While we are here, adopt the CODING_STYLE-mandated formatting.) 
+ +Signed-off-by: Ian Jackson +Acked-by: Wei Liu +Acked-by: Lars Kurth +Acked-by: George Dunlap +master commit: 41cb2db62627a7438d938aae487550c3f4acb1da +master date: 2018-07-12 16:38:30 +0100 +--- + xen/arch/x86/oprofile/nmi_int.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c +index d8f5230906..3dfb8fef93 100644 +--- a/xen/arch/x86/oprofile/nmi_int.c ++++ b/xen/arch/x86/oprofile/nmi_int.c +@@ -182,7 +182,7 @@ int nmi_reserve_counters(void) + if (!allocate_msrs()) + return -ENOMEM; + +- /* We walk a thin line between law and rape here. ++ /* + * We need to be careful to install our NMI handler + * without actually triggering any NMIs as this will + * break the core code horrifically. +-- +2.17.1 + + +From ac35e050b64a565fe234dd42e8dac163e946e58d Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli +Date: Mon, 30 Jul 2018 11:21:28 +0200 +Subject: [PATCH 06/42] mm/page_alloc: correct first_dirty calculations during + block merging + +Currently it's possible to hit an assertion in alloc_heap_pages(): + +Assertion 'first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub)' failed at page_alloc.c:988 + +This can happen because a piece of logic to calculate first_dirty +during block merging in free_heap_pages() is missing for the following +scenario: + +1. Current block's first_dirty equals to INVALID_DIRTY_IDX +2. Successor block is free but its first_dirty != INVALID_DIRTY_IDX +3. The successor is merged into current block +4. Current block's first_dirty still equals to INVALID_DIRTY_IDX + +This will trigger the assertion during allocation of such block in +alloc_heap_pages() because there will be pages with PGC_need_scrub +bit set despite the claim of first_dirty that the block is scrubbed. + +Add the missing piece of logic and slightly update the comment for +the predecessor case to better capture the code's intent. + +Fixes 1a37f33ea613 ("mm: Place unscrubbed pages at the end of pagelist") + +Signed-off-by: Sergey Dyasli +Reviewed-by: Jan Beulich +Reviewed-by: Boris Ostrovsky +master commit: 1e2df9608857b5355f2ec3b1a34b87a2007dcd16 +master date: 2018-07-12 10:45:11 +0200 +--- + xen/common/page_alloc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 20ee1e4897..02aeed7c47 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -1426,7 +1426,7 @@ static void free_heap_pages( + + page_list_del(predecessor, &heap(node, zone, order)); + +- /* Keep predecessor's first_dirty if it is already set. */ ++ /* Update predecessor's first_dirty if necessary. */ + if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX && + pg->u.free.first_dirty != INVALID_DIRTY_IDX ) + predecessor->u.free.first_dirty = (1U << order) + +@@ -1447,6 +1447,12 @@ static void free_heap_pages( + + check_and_stop_scrub(successor); + ++ /* Update pg's first_dirty if necessary. 
*/ ++ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX && ++ successor->u.free.first_dirty != INVALID_DIRTY_IDX ) ++ pg->u.free.first_dirty = (1U << order) + ++ successor->u.free.first_dirty; ++ + page_list_del(successor, &heap(node, zone, order)); + } + +-- +2.17.1 + + +From a44cf0c8728e08858638170a057675ca5479fdc7 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:22:06 +0200 +Subject: [PATCH 07/42] allow cpu_down() to be called earlier + +The function's use of the stop-machine logic has so far prevented its +use ahead of the processing of the "ordinary" initcalls. Since at this +early time we're in a controlled environment anyway, there's no need for +such a heavy tool. Additionally this ought to have less of a performance +impact especially on large systems, compared to the alternative of +making stop-machine functionality available earlier. + +Signed-off-by: Jan Beulich +Reviewed-by: Wei Liu +Reviewed-by: Andrew Cooper +master commit: 5894c0a2da66243a89088d309c7e1ea212ab28d6 +master date: 2018-07-16 15:15:12 +0200 +--- + xen/common/cpu.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/xen/common/cpu.c b/xen/common/cpu.c +index 6350f150bd..653a56b840 100644 +--- a/xen/common/cpu.c ++++ b/xen/common/cpu.c +@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb) + spin_unlock(&cpu_add_remove_lock); + } + +-static int take_cpu_down(void *unused) ++static void _take_cpu_down(void *unused) + { + void *hcpu = (void *)(long)smp_processor_id(); + int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL); + BUG_ON(notifier_rc != NOTIFY_DONE); + __cpu_disable(); ++} ++ ++static int take_cpu_down(void *arg) ++{ ++ _take_cpu_down(arg); + return 0; + } + +@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu) + goto fail; + } + +- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) ++ if ( unlikely(system_state < SYS_STATE_active) ) ++ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true); ++ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) + goto fail; + + __cpu_die(cpu); +-- +2.17.1 + + +From b53e0defcea1400c03f83d1d5cc30a3b237c8cfe Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:22:42 +0200 +Subject: [PATCH 08/42] x86/svm Fixes and cleanup to svm_inject_event() + + * State adjustments (and debug tracing) for #DB/#BP/#PF should not be done + for `int $n` instructions. Updates to %cr2 occur even if the exception + combines to #DF. + * Don't opencode DR_STEP when updating %dr6. + * Simplify the logic for calling svm_emul_swint_injection() as in the common + case, every condition needs checking. + * Fix comments which have become stale as code has moved between components. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Reviewed-by: Boris Ostrovsky +master commit: 8dab867c81ede455009028a9a88edc4ff3b9da88 +master date: 2018-07-17 10:12:40 +0100 +--- + xen/arch/x86/hvm/svm/svm.c | 41 ++++++++++++++++---------------------- + 1 file changed, 17 insertions(+), 24 deletions(-) + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index 165500e3f2..b964c59dad 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event) + * Xen must emulate enough of the event injection to be sure that a + * further fault shouldn't occur during delivery. This covers the fact + * that hardware doesn't perform DPL checking on injection. 
+- * +- * Also, it accounts for proper positioning of %rip for an event with trap +- * semantics (where %rip should point after the instruction) which suffers +- * a fault during injection (at which point %rip should point at the +- * instruction). + */ + if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION || +- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT || +- event->type == X86_EVENTTYPE_SW_EXCEPTION)) ) ++ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) ) + svm_emul_swint_injection(&_event); + +- switch ( _event.vector ) ++ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) ) + { + case TRAP_debug: + if ( regs->eflags & X86_EFLAGS_TF ) + { + __restore_debug_registers(vmcb, curr); +- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000); ++ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP); + } + /* fall through */ + case TRAP_int3: +@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event) + domain_pause_for_debugger(); + return; + } ++ break; ++ ++ case TRAP_page_fault: ++ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION); ++ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; ++ vmcb_set_cr2(vmcb, _event.cr2); ++ break; + } + + if ( unlikely(eventinj.fields.v) && +@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event) + * icebp, software events with trap semantics need emulating, so %rip in + * the trap frame points after the instruction. + * +- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will +- * have performed checks such as presence/dpl/etc and believes that the +- * event injection will succeed without faulting. +- * +- * The x86 emulator will always provide fault semantics for software +- * events, with _trap.insn_len set appropriately. If the injection +- * requires emulation, move %rip forwards at this point. ++ * svm_emul_swint_injection() has already confirmed that events with trap ++ * semantics won't fault on injection. Position %rip/NextRIP suitably, ++ * and restrict the event type to what hardware will tolerate. + */ + switch ( _event.type ) + { +@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event) + eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode); + vmcb->eventinj = eventinj; + +- if ( _event.vector == TRAP_page_fault ) +- { +- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; +- vmcb_set_cr2(vmcb, _event.cr2); +- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2)); +- } ++ if ( _event.vector == TRAP_page_fault && ++ _event.type == X86_EVENTTYPE_HW_EXCEPTION ) ++ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, ++ TRC_PAR_LONG(_event.cr2)); + else +- { + HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code); +- } + } + + static int svm_event_pending(struct vcpu *v) +-- +2.17.1 + + +From 0a2016ca2fabfe674c311dcfd8e15fec0ba3f7b6 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:23:22 +0200 +Subject: [PATCH 09/42] cpupools: fix state when downing a CPU failed + +While I've run into the issue with further patches in place which no +longer guarantee the per-CPU area to start out as all zeros, the +CPU_DOWN_FAILED processing looks to have the same issue: By not zapping +the per-CPU cpupool pointer, cpupool_cpu_add()'s (indirect) invocation +of schedule_cpu_switch() will trigger the "c != old_pool" assertion +there. + +Clearing the field during CPU_DOWN_PREPARE is too early (afaict this +should not happen before cpu_disable_scheduler()). 
Clearing it in +CPU_DEAD and CPU_DOWN_FAILED would be an option, but would take the same +piece of code twice. Since the field's value shouldn't matter while the +CPU is offline, simply clear it (implicitly) for CPU_ONLINE and +CPU_DOWN_FAILED, but only for other than the suspend/resume case (which +gets specially handled in cpupool_cpu_remove()). + +By adjusting the conditional in cpupool_cpu_add() CPU_DOWN_FAILED +handling in the suspend case should now also be handled better. + +Signed-off-by: Jan Beulich +Reviewed-by: Juergen Gross +master commit: cb1ae9a27819cea0c5008773c68a7be6f37eb0e5 +master date: 2018-07-19 09:41:55 +0200 +--- + xen/common/cpupool.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c +index 999839444e..1e8edcbd57 100644 +--- a/xen/common/cpupool.c ++++ b/xen/common/cpupool.c +@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu) + cpumask_clear_cpu(cpu, &cpupool_locked_cpus); + cpumask_set_cpu(cpu, &cpupool_free_cpus); + +- if ( system_state == SYS_STATE_resume ) ++ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume ) + { + struct cpupool **c; + +@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu) + * (or unplugging would have failed) and that is the default behavior + * anyway. + */ ++ per_cpu(cpupool, cpu) = NULL; + ret = cpupool_assign_cpu_locked(cpupool0, cpu); + } + out: +-- +2.17.1 + + +From bd51a6424202a5f1cd13dee6614bcb69ecbd2458 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:24:01 +0200 +Subject: [PATCH 10/42] x86/AMD: distinguish compute units from hyper-threads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fam17 replaces CUs by HTs, which we should reflect accordingly, even if +the difference is not very big. The most relevant change (requiring some +code restructuring) is that the topoext feature no longer means there is +a valid CU ID. + +Take the opportunity and convert wrongly plain int variables in +set_cpu_sibling_map() to unsigned int. + +Signed-off-by: Jan Beulich +Reviewed-by: Brian Woods +Reviewed-by: Roger Pau Monné +Acked-by: Andrew Cooper +master commit: 9429b07a0af7f92a5f25e4068e11db881e157495 +master date: 2018-07-19 09:42:42 +0200 +--- + xen/arch/x86/cpu/amd.c | 16 +++++++++++----- + xen/arch/x86/smpboot.c | 32 ++++++++++++++++++++------------ + 2 files changed, 31 insertions(+), 17 deletions(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 458a3fe60c..76078b55b2 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c) + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); +- c->compute_unit_id = ebx & 0xFF; + c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1; ++ ++ if (c->x86 < 0x17) ++ c->compute_unit_id = ebx & 0xFF; ++ else { ++ c->cpu_core_id = ebx & 0xFF; ++ c->x86_max_cores /= c->x86_num_siblings; ++ } + } + + if (opt_cpu_info) + printk("CPU %d(%d) -> Processor %d, %s %d\n", + cpu, c->x86_max_cores, c->phys_proc_id, +- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" : +- "Core", +- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id : +- c->cpu_core_id); ++ c->compute_unit_id != INVALID_CUID ? "Compute Unit" ++ : "Core", ++ c->compute_unit_id != INVALID_CUID ? 
c->compute_unit_id ++ : c->cpu_core_id); + } + + static void early_init_amd(struct cpuinfo_x86 *c) +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index d4478e6132..78ba73578a 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -234,33 +234,41 @@ static void link_thread_siblings(int cpu1, int cpu2) + cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1)); + } + +-static void set_cpu_sibling_map(int cpu) ++static void set_cpu_sibling_map(unsigned int cpu) + { +- int i; ++ unsigned int i; + struct cpuinfo_x86 *c = cpu_data; + + cpumask_set_cpu(cpu, &cpu_sibling_setup_map); + + cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]); ++ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu)); ++ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); + + if ( c[cpu].x86_num_siblings > 1 ) + { + for_each_cpu ( i, &cpu_sibling_setup_map ) + { +- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) { +- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].compute_unit_id == c[i].compute_unit_id) ) ++ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id ) ++ continue; ++ if ( c[cpu].compute_unit_id != INVALID_CUID && ++ c[i].compute_unit_id != INVALID_CUID ) ++ { ++ if ( c[cpu].compute_unit_id == c[i].compute_unit_id ) ++ link_thread_siblings(cpu, i); ++ } ++ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID && ++ c[i].cpu_core_id != XEN_INVALID_CORE_ID ) ++ { ++ if ( c[cpu].cpu_core_id == c[i].cpu_core_id ) + link_thread_siblings(cpu, i); +- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) { +- link_thread_siblings(cpu, i); + } ++ else ++ printk(XENLOG_WARNING ++ "CPU%u: unclear relationship with CPU%u\n", ++ cpu, i); + } + } +- else +- { +- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); +- } + + if ( c[cpu].x86_max_cores == 1 ) + { +-- +2.17.1 + + +From 5908b4866b682d9189c36eddf7c898fd95b27ec1 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:24:53 +0200 +Subject: [PATCH 11/42] x86: distinguish CPU offlining from CPU removal + +In order to be able to service #MC on offlined CPUs, the GDT, IDT, +stack, and per-CPU data (which includes the TSS) need to be kept +allocated. They should only be freed upon CPU removal (which we +currently don't support, so some code is becoming effectively dead for +the moment). + +Note that for now park_offline_cpus doesn't get set to true anywhere - +this is going to be the subject of a subsequent patch. 
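For illustration only (this text is not part of the patch): the notifier pattern which the hunks below apply to the MCE bank masks, the per-CPU areas and the x2APIC cluster masks looks roughly as follows; foo_free() and the callback name are hypothetical stand-ins for each subsystem's own helpers:

    static int foo_cpu_callback(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
    {
        unsigned int cpu = (unsigned long)hcpu;

        switch ( action )
        {
        case CPU_UP_CANCELED:
        case CPU_DEAD:
            /* CPU went offline: keep its state if offline CPUs are parked. */
            if ( !park_offline_cpus )
                foo_free(cpu);
            break;

        case CPU_REMOVE:
            /* CPU is being removed: release state kept while it was parked. */
            if ( park_offline_cpus )
                foo_free(cpu);
            break;
        }

        return NOTIFY_DONE;
    }
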
+ +Signed-off-by: Jan Beulich +Reviewed-by: Wei Liu +Reviewed-by: Andrew Cooper +master commit: 2e6c8f182c9c50129b1c7a620242861e6ad6a9fb +master date: 2018-07-19 13:43:33 +0100 +--- + xen/arch/x86/cpu/mcheck/mce.c | 15 ++++++-- + xen/arch/x86/domain.c | 9 +++-- + xen/arch/x86/genapic/x2apic.c | 9 +++-- + xen/arch/x86/percpu.c | 9 +++-- + xen/arch/x86/smpboot.c | 71 ++++++++++++++++++++++------------- + xen/include/asm-x86/smp.h | 2 + + xen/include/xen/cpu.h | 2 + + xen/include/xen/cpumask.h | 23 ++++++++++++ + xen/include/xen/mm.h | 8 ++++ + xen/include/xen/xmalloc.h | 6 +++ + 10 files changed, 115 insertions(+), 39 deletions(-) + +diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c +index a8c287d124..32273d9208 100644 +--- a/xen/arch/x86/cpu/mcheck/mce.c ++++ b/xen/arch/x86/cpu/mcheck/mce.c +@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu) + + mcabanks_free(poll); + mcabanks_free(clr); ++ ++ per_cpu(poll_bankmask, cpu) = NULL; ++ per_cpu(mce_clear_banks, cpu) = NULL; + } + + static int cpu_bank_alloc(unsigned int cpu) + { +- struct mca_banks *poll = mcabanks_alloc(); +- struct mca_banks *clr = mcabanks_alloc(); ++ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(); ++ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(); + + if ( !poll || !clr ) + { +@@ -725,7 +728,13 @@ static int cpu_callback( + + case CPU_UP_CANCELED: + case CPU_DEAD: +- cpu_bank_free(cpu); ++ if ( !park_offline_cpus ) ++ cpu_bank_free(cpu); ++ break; ++ ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ cpu_bank_free(cpu); + break; + } + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 9850a782ec..c39cf2c6e5 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -107,10 +107,11 @@ static void play_dead(void) + local_irq_disable(); + + /* +- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible, +- * as they may be freed at any time. In this case, heap corruption or +- * #PF can occur (when heap debugging is enabled). For example, even +- * printk() can involve tasklet scheduling, which touches per-cpu vars. ++ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible, ++ * as they may be freed at any time if offline CPUs don't get parked. In ++ * this case, heap corruption or #PF can occur (when heap debugging is ++ * enabled). For example, even printk() can involve tasklet scheduling, ++ * which touches per-cpu vars. + * + * Consider very carefully when adding code to *dead_idle. Most hypervisor + * subsystems are unsafe to call. 
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index 4779b0d0d5..d997806272 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -201,18 +201,21 @@ static int update_clusterinfo( + if ( !cluster_cpus_spare ) + cluster_cpus_spare = xzalloc(cpumask_t); + if ( !cluster_cpus_spare || +- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) ++ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) + err = -ENOMEM; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: ++ case CPU_REMOVE: ++ if ( park_offline_cpus == (action != CPU_REMOVE) ) ++ break; + if ( per_cpu(cluster_cpus, cpu) ) + { + cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu)); + if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) ) +- xfree(per_cpu(cluster_cpus, cpu)); ++ XFREE(per_cpu(cluster_cpus, cpu)); + } +- free_cpumask_var(per_cpu(scratch_mask, cpu)); ++ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu)); + break; + } + +diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c +index c9997b7937..8be4ebddf4 100644 +--- a/xen/arch/x86/percpu.c ++++ b/xen/arch/x86/percpu.c +@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu) + char *p; + + if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA ) +- return -EBUSY; ++ return 0; + + if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL ) + return -ENOMEM; +@@ -76,9 +76,12 @@ static int cpu_percpu_callback( + break; + case CPU_UP_CANCELED: + case CPU_DEAD: +- free_percpu_area(cpu); ++ if ( !park_offline_cpus ) ++ free_percpu_area(cpu); + break; +- default: ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ free_percpu_area(cpu); + break; + } + +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 78ba73578a..7e76cc3d68 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask; + cpumask_t cpu_online_map __read_mostly; + EXPORT_SYMBOL(cpu_online_map); + ++bool __read_mostly park_offline_cpus; ++ + unsigned int __read_mostly nr_sockets; + cpumask_t **__read_mostly socket_cpumask; + static cpumask_t *secondary_socket_cpumask; +@@ -895,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu) + } + } + +-static void cpu_smpboot_free(unsigned int cpu) ++/* ++ * The 'remove' boolean controls whether a CPU is just getting offlined (and ++ * parked), or outright removed / offlined without parking. Parked CPUs need ++ * things like their stack, GDT, IDT, TSS, and per-CPU data still available. ++ * A few other items, in particular CPU masks, are also retained, as it's ++ * difficult to prove that they're entirely unreferenced from parked CPUs. 
++ */ ++static void cpu_smpboot_free(unsigned int cpu, bool remove) + { + unsigned int order, socket = cpu_to_socket(cpu); + struct cpuinfo_x86 *c = cpu_data; +@@ -906,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu) + socket_cpumask[socket] = NULL; + } + +- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; +- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; +- c[cpu].compute_unit_id = INVALID_CUID; + cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); + +- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu)); +- free_cpumask_var(per_cpu(cpu_core_mask, cpu)); +- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask ) +- free_cpumask_var(per_cpu(scratch_cpumask, cpu)); ++ if ( remove ) ++ { ++ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; ++ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; ++ c[cpu].compute_unit_id = INVALID_CUID; ++ ++ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu)); ++ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu)); ++ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask ) ++ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu)); ++ } + + cleanup_cpu_root_pgt(cpu); + +@@ -936,19 +949,21 @@ static void cpu_smpboot_free(unsigned int cpu) + } + + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); +- free_xenheap_pages(per_cpu(gdt_table, cpu), order); ++ if ( remove ) ++ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order); + + free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order); + +- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); +- free_xenheap_pages(idt_tables[cpu], order); +- idt_tables[cpu] = NULL; +- +- if ( stack_base[cpu] != NULL ) ++ if ( remove ) + { +- memguard_unguard_stack(stack_base[cpu]); +- free_xenheap_pages(stack_base[cpu], STACK_ORDER); +- stack_base[cpu] = NULL; ++ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); ++ FREE_XENHEAP_PAGES(idt_tables[cpu], order); ++ ++ if ( stack_base[cpu] ) ++ { ++ memguard_unguard_stack(stack_base[cpu]); ++ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER); ++ } + } + } + +@@ -963,15 +978,17 @@ static int cpu_smpboot_alloc(unsigned int cpu) + if ( node != NUMA_NO_NODE ) + memflags = MEMF_node(node); + +- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); ++ if ( stack_base[cpu] == NULL ) ++ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); + if ( stack_base[cpu] == NULL ) + goto out; + memguard_guard_stack(stack_base[cpu]); + + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); +- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags); ++ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags); + if ( gdt == NULL ) + goto out; ++ per_cpu(gdt_table, cpu) = gdt; + memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE); + BUILD_BUG_ON(NR_CPUS > 0x10000); + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; +@@ -983,7 +1000,8 @@ static int cpu_smpboot_alloc(unsigned int cpu) + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; + + order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); +- idt_tables[cpu] = alloc_xenheap_pages(order, memflags); ++ if ( idt_tables[cpu] == NULL ) ++ idt_tables[cpu] = alloc_xenheap_pages(order, memflags); + if ( idt_tables[cpu] == NULL ) + goto out; + memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t)); +@@ -1011,16 +1029,16 @@ static int cpu_smpboot_alloc(unsigned int cpu) + (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL ) + goto out; + +- if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && +- zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) && +- 
alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) ) ++ if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && ++ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) && ++ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) ) + goto out; + + rc = 0; + + out: + if ( rc ) +- cpu_smpboot_free(cpu); ++ cpu_smpboot_free(cpu, true); + + return rc; + } +@@ -1038,9 +1056,10 @@ static int cpu_smpboot_callback( + break; + case CPU_UP_CANCELED: + case CPU_DEAD: +- cpu_smpboot_free(cpu); ++ cpu_smpboot_free(cpu, !park_offline_cpus); + break; +- default: ++ case CPU_REMOVE: ++ cpu_smpboot_free(cpu, true); + break; + } + +diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h +index 4e5f673fec..09c55458df 100644 +--- a/xen/include/asm-x86/smp.h ++++ b/xen/include/asm-x86/smp.h +@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask); + DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask); + DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask); + ++extern bool park_offline_cpus; ++ + void smp_send_nmi_allbutself(void); + + void send_IPI_mask(const cpumask_t *, int vector); +diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h +index ffefc09f8e..2fe3ec05d8 100644 +--- a/xen/include/xen/cpu.h ++++ b/xen/include/xen/cpu.h +@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb); + #define CPU_DYING (0x0007 | NOTIFY_REVERSE) + /* CPU_DEAD: CPU is dead. */ + #define CPU_DEAD (0x0008 | NOTIFY_REVERSE) ++/* CPU_REMOVE: CPU was removed. */ ++#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE) + + /* Perform CPU hotplug. May return -EAGAIN. */ + int cpu_down(unsigned int cpu); +diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h +index 42340a098e..4a11bcc3f3 100644 +--- a/xen/include/xen/cpumask.h ++++ b/xen/include/xen/cpumask.h +@@ -351,16 +351,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) + return *mask != NULL; + } + ++static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask) ++{ ++ if (*mask == NULL) ++ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long)); ++ return *mask != NULL; ++} ++ + static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) + { + *(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long)); + return *mask != NULL; + } + ++static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask) ++{ ++ if (*mask == NULL) ++ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long)); ++ else ++ cpumask_clear(*mask); ++ return *mask != NULL; ++} ++ + static inline void free_cpumask_var(cpumask_var_t mask) + { + xfree(mask); + } ++ ++/* Free an allocated mask, and zero the pointer to it. 
*/ ++#define FREE_CPUMASK_VAR(m) XFREE(m) + #else + typedef cpumask_t cpumask_var_t[1]; + +@@ -368,16 +387,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) + { + return 1; + } ++#define cond_alloc_cpumask_var alloc_cpumask_var + + static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) + { + cpumask_clear(*mask); + return 1; + } ++#define cond_zalloc_cpumask_var zalloc_cpumask_var + + static inline void free_cpumask_var(cpumask_var_t mask) + { + } ++ ++#define FREE_CPUMASK_VAR(m) free_cpumask_var(m) + #endif + + #if NR_CPUS > 1 +diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h +index e928551c91..24654e8e22 100644 +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order); + bool scrub_free_pages(void); + #define alloc_xenheap_page() (alloc_xenheap_pages(0,0)) + #define free_xenheap_page(v) (free_xenheap_pages(v,0)) ++ ++/* Free an allocation, and zero the pointer to it. */ ++#define FREE_XENHEAP_PAGES(p, o) do { \ ++ free_xenheap_pages(p, o); \ ++ (p) = NULL; \ ++} while ( false ) ++#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) ++ + /* Map machine page range in Xen virtual address space. */ + int map_pages_to_xen( + unsigned long virt, +diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h +index cc2673d8ae..9aa5edf593 100644 +--- a/xen/include/xen/xmalloc.h ++++ b/xen/include/xen/xmalloc.h +@@ -26,6 +26,12 @@ + /* Free any of the above. */ + extern void xfree(void *); + ++/* Free an allocation, and zero the pointer to it. */ ++#define XFREE(p) do { \ ++ xfree(p); \ ++ (p) = NULL; \ ++} while ( false ) ++ + /* Underlying functions */ + extern void *_xmalloc(unsigned long size, unsigned long align); + extern void *_xzalloc(unsigned long size, unsigned long align); +-- +2.17.1 + + +From 75313e478e894176056e1fc5852136b344a0dc70 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:25:38 +0200 +Subject: [PATCH 12/42] x86: possibly bring up all CPUs even if not all are + supposed to be used +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reportedly Intel CPUs which can't broadcast #MC to all targeted +cores/threads because some have CR4.MCE clear will shut down. Therefore +we want to keep CR4.MCE enabled when offlining a CPU, and we need to +bring up all CPUs in order to be able to set CR4.MCE in the first place. + +The use of clear_in_cr4() in cpu_mcheck_disable() was ill advised +anyway, and to avoid future similar mistakes I'm removing clear_in_cr4() +altogether right here. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Wei Liu +master commit: 8797d20a6ec2dd75195585a107ce345c51c0a59a +master date: 2018-07-19 13:43:33 +0100 +--- + xen/arch/x86/cpu/common.c | 4 ++++ + xen/arch/x86/cpu/mcheck/mce_intel.c | 2 -- + xen/arch/x86/mpparse.c | 15 +++++++++++---- + xen/arch/x86/setup.c | 18 +++++++++++++++--- + xen/include/asm-x86/processor.h | 6 ------ + 5 files changed, 30 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index 528aff1811..fdb022875a 100644 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -14,6 +14,7 @@ + #include /* for XEN_INVALID_{SOCKET,CORE}_ID */ + + #include "cpu.h" ++#include "mcheck/x86_mca.h" + + bool_t opt_arat = 1; + boolean_param("arat", opt_arat); +@@ -355,6 +356,9 @@ static void __init early_cpu_detect(void) + hap_paddr_bits = PADDR_BITS; + } + ++ if (c->x86_vendor != X86_VENDOR_AMD) ++ park_offline_cpus = opt_mce; ++ + initialize_cpu_data(0); + } + +diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c +index e5dd956a24..4474a34e34 100644 +--- a/xen/arch/x86/cpu/mcheck/mce_intel.c ++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c +@@ -636,8 +636,6 @@ static void clear_cmci(void) + + static void cpu_mcheck_disable(void) + { +- clear_in_cr4(X86_CR4_MCE); +- + if ( cmci_support && opt_mce ) + clear_cmci(); + } +diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c +index 49140e46f0..f3f6d48668 100644 +--- a/xen/arch/x86/mpparse.c ++++ b/xen/arch/x86/mpparse.c +@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map; + + void __init set_nr_cpu_ids(unsigned int max_cpus) + { ++ unsigned int tot_cpus = num_processors + disabled_cpus; ++ + if (!max_cpus) +- max_cpus = num_processors + disabled_cpus; ++ max_cpus = tot_cpus; + if (max_cpus > NR_CPUS) + max_cpus = NR_CPUS; + else if (!max_cpus) + max_cpus = 1; + printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n", + max_cpus, max_t(int, max_cpus - num_processors, 0)); +- nr_cpu_ids = max_cpus; ++ ++ if (!park_offline_cpus) ++ tot_cpus = max_cpus; ++ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u); ++ if (park_offline_cpus && nr_cpu_ids < num_processors) ++ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n", ++ num_processors - nr_cpu_ids); + + #ifndef nr_cpumask_bits +- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) & +- ~(BITS_PER_LONG - 1); ++ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG); + printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n", + NR_CPUS, nr_cpumask_bits); + #endif +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index a3172ca92c..984c948216 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -665,7 +665,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + { + char *memmap_type = NULL; + char *cmdline, *kextra, *loader; +- unsigned int initrdidx; ++ unsigned int initrdidx, num_parked = 0; + multiboot_info_t *mbi; + module_t *mod; + unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; +@@ -1494,7 +1494,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + else + { + set_nr_cpu_ids(max_cpus); +- max_cpus = nr_cpu_ids; ++ if ( !max_cpus ) ++ max_cpus = nr_cpu_ids; + } + + if ( xen_guest ) +@@ -1617,16 +1618,27 @@ void __init noreturn __start_xen(unsigned long mbi_p) + /* Set up node_to_cpumask based on cpu_to_node[]. 
*/ + numa_add_cpu(i); + +- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) ) ++ if ( (park_offline_cpus || num_online_cpus() < max_cpus) && ++ !cpu_online(i) ) + { + int ret = cpu_up(i); + if ( ret != 0 ) + printk("Failed to bring up CPU %u (error %d)\n", i, ret); ++ else if ( num_online_cpus() > max_cpus ) ++ { ++ ret = cpu_down(i); ++ if ( !ret ) ++ ++num_parked; ++ else ++ printk("Could not re-offline CPU%u (%d)\n", i, ret); ++ } + } + } + } + + printk("Brought up %ld CPUs\n", (long)num_online_cpus()); ++ if ( num_parked ) ++ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked); + smp_cpus_done(); + + do_initcalls(); +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 9924cdf1f3..2bd9e69684 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -337,12 +337,6 @@ static always_inline void set_in_cr4 (unsigned long mask) + write_cr4(read_cr4() | mask); + } + +-static always_inline void clear_in_cr4 (unsigned long mask) +-{ +- mmu_cr4_features &= ~mask; +- write_cr4(read_cr4() & ~mask); +-} +- + static inline unsigned int read_pkru(void) + { + unsigned int pkru; +-- +2.17.1 + + +From 353edf12c865d2a1e24173aac841452b90614915 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 30 Jul 2018 11:26:16 +0200 +Subject: [PATCH 13/42] x86: command line option to avoid use of secondary + hyper-threads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Shared resources (L1 cache and TLB in particular) present a risk of +information leak via side channels. Provide a means to avoid use of +hyperthreads in such cases. + +Signed-off-by: Jan Beulich +Reviewed-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +master commit: d8f974f1a646c0200b97ebcabb808324b288fadb +master date: 2018-07-19 13:43:33 +0100 +--- + docs/misc/xen-command-line.markdown | 7 +++++++ + xen/arch/x86/setup.c | 8 +++++++- + xen/arch/x86/sysctl.c | 16 +++++++++++++++- + xen/include/asm-x86/setup.h | 2 ++ + 4 files changed, 31 insertions(+), 2 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 075e5ea159..3b710b71fb 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1748,6 +1748,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only. + Flag to enable Supervisor Mode Execution Protection + Use `smep=hvm` to allow SMEP use by HVM guests only. + ++### smt (x86) ++> `= ` ++ ++Default: `true` ++ ++Control bring up of multiple hyper-threads per CPU core. ++ + ### snb\_igd\_quirk + > `= | cap | ` + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 984c948216..66fd13f93a 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -62,6 +62,9 @@ boolean_param("nosmp", opt_nosmp); + static unsigned int __initdata max_cpus; + integer_param("maxcpus", max_cpus); + ++int8_t __read_mostly opt_smt = -1; ++boolean_param("smt", opt_smt); ++ + /* opt_invpcid: If false, don't use INVPCID instruction even if available. 
*/ + static bool __initdata opt_invpcid = true; + boolean_param("invpcid", opt_invpcid); +@@ -1624,7 +1627,10 @@ void __init noreturn __start_xen(unsigned long mbi_p) + int ret = cpu_up(i); + if ( ret != 0 ) + printk("Failed to bring up CPU %u (error %d)\n", i, ret); +- else if ( num_online_cpus() > max_cpus ) ++ else if ( num_online_cpus() > max_cpus || ++ (!opt_smt && ++ cpu_data[i].compute_unit_id == INVALID_CUID && ++ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) ) + { + ret = cpu_down(i); + if ( !ret ) +diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c +index 4d372db12b..e704ed7f1c 100644 +--- a/xen/arch/x86/sysctl.c ++++ b/xen/arch/x86/sysctl.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg) + + long cpu_up_helper(void *data) + { +- int cpu = (unsigned long)data; ++ unsigned int cpu = (unsigned long)data; + int ret = cpu_up(cpu); ++ + if ( ret == -EBUSY ) + { + /* On EBUSY, flush RCU work and have one more go. */ + rcu_barrier(); + ret = cpu_up(cpu); + } ++ ++ if ( !ret && !opt_smt && ++ cpu_data[cpu].compute_unit_id == INVALID_CUID && ++ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 ) ++ { ++ ret = cpu_down_helper(data); ++ if ( ret ) ++ printk("Could not re-offline CPU%u (%d)\n", cpu, ret); ++ else ++ ret = -EPERM; ++ } ++ + return ret; + } + +diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h +index 19232afa01..c09a5ff381 100644 +--- a/xen/include/asm-x86/setup.h ++++ b/xen/include/asm-x86/setup.h +@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags; + extern unsigned long highmem_start; + #endif + ++extern int8_t opt_smt; ++ + #ifdef CONFIG_SHADOW_PAGING + extern bool opt_dom0_shadow; + #else +-- +2.17.1 + + +From 037fe82cf5fadf0f74c3da70560ee7592a8f2083 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:26:53 +0200 +Subject: [PATCH 14/42] x86/vmx: Don't clobber %dr6 while debugging state is + lazy +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +c/s 4f36452b63 introduced a write to %dr6 in the #DB intercept case, but the +guests debug registers may be lazy at this point, at which point the guests +later attempt to read %dr6 will discard this value and use the older stale +value. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Acked-by: Kevin Tian +master commit: 3cdac2805692c7accde2f405d81cc0be799aee48 +master date: 2018-07-19 14:06:48 +0100 +--- + xen/arch/x86/hvm/vmx/vmx.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 610c8d6eb9..7189820bfc 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -3701,6 +3701,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + */ + __vmread(EXIT_QUALIFICATION, &exit_qualification); + HVMTRACE_1D(TRAP_DEBUG, exit_qualification); ++ __restore_debug_registers(v); + write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE); + if ( !v->domain->debugger_attached ) + { +-- +2.17.1 + + +From 543027c9842d8416047ef38846d2de1295052e92 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:27:33 +0200 +Subject: [PATCH 15/42] x86/xstate: Use a guests CPUID policy, rather than + allowing all features + +It turns out that Xen has never enforced that a domain remain within the +xstate features advertised in CPUID. 
+ +The check of new_bv against xfeature_mask ensures that a domain stays within +the set of features that Xen has enabled in hardware (and therefore isn't a +security problem), but this does means that attempts to level a guest for +migration safety might not be effective if the guest ignores CPUID. + +Check the CPUID policy in validate_xstate() (for incoming migration) and in +handle_xsetbv() (for guest XSETBV instructions). This subsumes the PKRU check +for PV guests in handle_xsetbv() (and also demonstrates that I should have +spotted this problem while reviewing c/s fbf9971241f). + +For migration, this is correct despite the current (mis)ordering of data +because d->arch.cpuid is the applicable max policy. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: 361b835fa00d9f45167c50a60e054ccf22c065d7 +master date: 2018-07-19 19:57:26 +0100 +--- + xen/arch/x86/domctl.c | 2 +- + xen/arch/x86/hvm/hvm.c | 2 +- + xen/arch/x86/xstate.c | 17 +++++++++++------ + xen/include/asm-x86/xstate.h | 5 +++-- + 4 files changed, 16 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index b04388d663..fa82b6744e 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -1163,7 +1163,7 @@ long arch_do_domctl( + if ( _xcr0_accum ) + { + if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE ) +- ret = validate_xstate(_xcr0, _xcr0_accum, ++ ret = validate_xstate(d, _xcr0, _xcr0_accum, + &_xsave_area->xsave_hdr); + } + else if ( !_xcr0 ) +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index 279cb88e45..d544720876 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1269,7 +1269,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; + h->cur += desc->length; + +- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, ++ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum, + (const void *)&ctxt->save_area.xsave_hdr); + if ( err ) + { +diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c +index b4aea4b50a..1fbb0871d0 100644 +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c +@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0) + return !(xcr0 & X86_XCR0_BNDREGS) == !(xcr0 & X86_XCR0_BNDCSR); + } + +-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr) ++int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum, ++ const struct xsave_hdr *hdr) + { ++ const struct cpuid_policy *cp = d->arch.cpuid; ++ uint64_t xcr0_max = ++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low; + unsigned int i; + + if ( (hdr->xstate_bv & ~xcr0_accum) || + (xcr0 & ~xcr0_accum) || ++ (xcr0_accum & ~xcr0_max) || + !valid_xcr0(xcr0) || + !valid_xcr0(xcr0_accum) ) + return -EINVAL; +@@ -694,18 +699,18 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr) + int handle_xsetbv(u32 index, u64 new_bv) + { + struct vcpu *curr = current; ++ const struct cpuid_policy *cp = curr->domain->arch.cpuid; ++ uint64_t xcr0_max = ++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low; + u64 mask; + + if ( index != XCR_XFEATURE_ENABLED_MASK ) + return -EOPNOTSUPP; + +- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) ) ++ if ( (new_bv & ~xcr0_max) || ++ (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) ) + return -EINVAL; + +- /* XCR0.PKRU is disabled on PV mode. 
*/ +- if ( is_pv_vcpu(curr) && (new_bv & X86_XCR0_PKRU) ) +- return -EOPNOTSUPP; +- + if ( !set_xcr0(new_bv) ) + return -EFAULT; + +diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h +index 86a4a1f75c..47f602b855 100644 +--- a/xen/include/asm-x86/xstate.h ++++ b/xen/include/asm-x86/xstate.h +@@ -97,8 +97,9 @@ void xsave(struct vcpu *v, uint64_t mask); + void xrstor(struct vcpu *v, uint64_t mask); + void xstate_set_init(uint64_t mask); + bool xsave_enabled(const struct vcpu *v); +-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, +- const struct xsave_hdr *); ++int __must_check validate_xstate(const struct domain *d, ++ uint64_t xcr0, uint64_t xcr0_accum, ++ const struct xsave_hdr *hdr); + int __must_check handle_xsetbv(u32 index, u64 new_bv); + void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size); + void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size); +-- +2.17.1 + + +From 06d2a763d07d53a4ccc7bd1255ffc9ea01ec1609 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:29:00 +0200 +Subject: [PATCH 16/42] x86/xstate: Make errors in xstate calculations more + obvious by crashing the domain + +If xcr0_max exceeds xfeature_mask, then something is broken with the CPUID +policy derivation or auditing logic. If hardware rejects new_bv, then +something is broken with Xen's xstate logic. + +In both cases, crash the domain with an obvious error message, to help +highlight the issues. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: d6371ccb93012db4ad6615fe666205b86308cb4e +master date: 2018-07-19 19:57:26 +0100 +--- + xen/arch/x86/xstate.c | 26 +++++++++++++++++++++++--- + 1 file changed, 23 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c +index 1fbb0871d0..15edd5df96 100644 +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c +@@ -707,12 +707,32 @@ int handle_xsetbv(u32 index, u64 new_bv) + if ( index != XCR_XFEATURE_ENABLED_MASK ) + return -EOPNOTSUPP; + +- if ( (new_bv & ~xcr0_max) || +- (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) ) ++ /* ++ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's ++ * maximum features, but keep the check for robustness. ++ */ ++ if ( unlikely(xcr0_max & ~xfeature_mask) ) ++ { ++ gprintk(XENLOG_ERR, ++ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n", ++ xcr0_max, xfeature_mask); ++ domain_crash(curr->domain); ++ ++ return -EINVAL; ++ } ++ ++ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) ) + return -EINVAL; + +- if ( !set_xcr0(new_bv) ) ++ /* By this point, new_bv really should be accepted by hardware. */ ++ if ( unlikely(!set_xcr0(new_bv)) ) ++ { ++ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n", ++ new_bv); ++ domain_crash(curr->domain); ++ + return -EFAULT; ++ } + + mask = new_bv & ~curr->arch.xcr0_accum; + curr->arch.xcr0 = new_bv; +-- +2.17.1 + + +From 7de21555730367497eb01edf6e9e9530224105e7 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:29:39 +0200 +Subject: [PATCH 17/42] x86/hvm: Disallow unknown MSR_EFER bits +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It turns out that nothing ever prevented HVM guests from trying to set unknown +EFER bits. Generally, this results in a vmentry failure. + +For Intel hardware, all implemented bits are covered by the checks. 
+ +For AMD hardware, the only EFER bit which isn't covered by the checks is TCE +(which AFAICT is specific to AMD Fam15/16 hardware). We never advertise TCE +in CPUID, but it isn't a security problem to have TCE unexpected enabled in +guest context. + +Disallow the setting of bits outside of the EFER_KNOWN_MASK, which prevents +any vmentry failures for guests, yielding #GP instead. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Wei Liu +Acked-by: Jan Beulich +master commit: ef0269c6215d642a709866f04ba1a1f9f13f3614 +master date: 2018-07-24 11:25:53 +0100 +--- + xen/arch/x86/hvm/hvm.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index d544720876..4cbb688c05 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -907,6 +907,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value, + else + p = &host_cpuid_policy; + ++ if ( value & ~EFER_KNOWN_MASK ) ++ return "Unknown bits set"; ++ + if ( (value & EFER_SCE) && !p->extd.syscall ) + return "SCE without feature"; + +-- +2.17.1 + + +From 33ced725e11af4eabd3334d12f53ed807e9e2586 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 30 Jul 2018 11:30:09 +0200 +Subject: [PATCH 18/42] x86/spec-ctrl: Fix the parsing of xpti= on fixed Intel + hardware + +The calls to xpti_init_default() in parse_xpti() are buggy. The CPUID data +hasn't been fetched that early, and boot_cpu_has(X86_FEATURE_ARCH_CAPS) will +always evaluate false. + +As a result, the default case won't disable XPTI on Intel hardware which +advertises ARCH_CAPABILITIES_RDCL_NO. + +Simplify parse_xpti() to solely the setting of opt_xpti according to the +passed string, and have init_speculation_mitigations() call +xpti_init_default() if appropiate. Drop the force parameter, and pass caps +instead, to avoid redundant re-reading of MSR_ARCH_CAPS. + +Signed-off-by: Andrew Cooper +Reviewed-by: Juergen Gross +Reviewed-by: Wei Liu +Acked-by: Jan Beulich +master commit: be5e2ff6f54e0245331ed360b8786760f82fd673 +master date: 2018-07-24 11:25:54 +0100 +--- + xen/arch/x86/spec_ctrl.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 73dc7170c7..32a4ea6e99 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -423,17 +423,10 @@ static bool __init should_use_eager_fpu(void) + #define OPT_XPTI_DEFAULT 0xff + uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT; + +-static __init void xpti_init_default(bool force) ++static __init void xpti_init_default(uint64_t caps) + { +- uint64_t caps = 0; +- +- if ( !force && (opt_xpti != OPT_XPTI_DEFAULT) ) +- return; +- + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) + caps = ARCH_CAPABILITIES_RDCL_NO; +- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) +- rdmsrl(MSR_ARCH_CAPABILITIES, caps); + + if ( caps & ARCH_CAPABILITIES_RDCL_NO ) + opt_xpti = 0; +@@ -446,8 +439,6 @@ static __init int parse_xpti(const char *s) + const char *ss; + int val, rc = 0; + +- xpti_init_default(false); +- + do { + ss = strchr(s, ','); + if ( !ss ) +@@ -465,7 +456,7 @@ static __init int parse_xpti(const char *s) + + default: + if ( !strcmp(s, "default") ) +- xpti_init_default(true); ++ opt_xpti = OPT_XPTI_DEFAULT; + else if ( (val = parse_boolean("dom0", s, ss)) >= 0 ) + opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) | + (val ? 
OPT_XPTI_DOM0 : 0); +@@ -627,7 +618,9 @@ void __init init_speculation_mitigations(void) + if ( default_xen_spec_ctrl ) + setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); + +- xpti_init_default(false); ++ if ( opt_xpti == OPT_XPTI_DEFAULT ) ++ xpti_init_default(caps); ++ + if ( opt_xpti == 0 ) + setup_force_cpu_cap(X86_FEATURE_NO_XPTI); + else +-- +2.17.1 + + +From 6fe9726aebc11433083b9810402501f1a71d02fd Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 9 Aug 2018 17:22:17 +0100 +Subject: [PATCH 19/42] x86/spec-ctrl: Yet more fixes for xpti= parsing + +As it currently stands, 'xpti=dom0' is indistinguishable from the default +value, which means it will be overridden by ARCH_CAPABILITIES_RDCL_NO on fixed +hardware. + +Switch opt_xpti to use -1 as a default like all our other related options, and +clobber it as soon as we have a string to parse. + +In addition, 'xpti' alone should be interpreted in its positive boolean form, +rather than resulting in a parse error. + + (XEN) parameter "xpti" has invalid value "", rc=-22! + +Signed-off-by: Andrew Cooper +Reviewed-by: Juergen Gross +Reviewed-by: Jan Beulich +(cherry picked from commit 2a3b34ec47817048ab59586855cf0709fc77487e) +--- + xen/arch/x86/spec_ctrl.c | 15 +++++++++++---- + xen/include/asm-x86/spec_ctrl.h | 2 +- + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 32a4ea6e99..32213ace86 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -420,8 +420,7 @@ static bool __init should_use_eager_fpu(void) + } + } + +-#define OPT_XPTI_DEFAULT 0xff +-uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT; ++int8_t __read_mostly opt_xpti = -1; + + static __init void xpti_init_default(uint64_t caps) + { +@@ -439,6 +438,14 @@ static __init int parse_xpti(const char *s) + const char *ss; + int val, rc = 0; + ++ /* Inhibit the defaults as an explicit choice has been given. */ ++ if ( opt_xpti == -1 ) ++ opt_xpti = 0; ++ ++ /* Interpret 'xpti' alone in its positive boolean form. */ ++ if ( *s == '\0' ) ++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU; ++ + do { + ss = strchr(s, ','); + if ( !ss ) +@@ -456,7 +463,7 @@ static __init int parse_xpti(const char *s) + + default: + if ( !strcmp(s, "default") ) +- opt_xpti = OPT_XPTI_DEFAULT; ++ opt_xpti = -1; + else if ( (val = parse_boolean("dom0", s, ss)) >= 0 ) + opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) | + (val ? OPT_XPTI_DOM0 : 0); +@@ -618,7 +625,7 @@ void __init init_speculation_mitigations(void) + if ( default_xen_spec_ctrl ) + setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); + +- if ( opt_xpti == OPT_XPTI_DEFAULT ) ++ if ( opt_xpti == -1 ) + xpti_init_default(caps); + + if ( opt_xpti == 0 ) +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index 5b40afbab0..fea82603ca 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -34,7 +34,7 @@ extern bool bsp_delay_spec_ctrl; + extern uint8_t default_xen_spec_ctrl; + extern uint8_t default_spec_ctrl_flags; + +-extern uint8_t opt_xpti; ++extern int8_t opt_xpti; + #define OPT_XPTI_DOM0 0x01 + #define OPT_XPTI_DOMU 0x02 + +-- +2.17.1 + + +From 4254e9874006cc2641b67d0531a3a65374f34c35 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 24 May 2018 17:20:09 +0000 +Subject: [PATCH 20/42] x86/vmx: Fix handing of MSR_DEBUGCTL on VMExit + +Currently, whenever the guest writes a nonzero value to MSR_DEBUGCTL, Xen +updates a host MSR load list entry with the current hardware value of +MSR_DEBUGCTL. 
+ +On VMExit, hardware automatically resets MSR_DEBUGCTL to 0. Later, when the +guest writes to MSR_DEBUGCTL, the current value in hardware (0) is fed back +into guest load list. As a practical result, `ler` debugging gets lost on any +PCPU which has ever scheduled an HVM vcpu, and the common case when `ler` +debugging isn't active, guest actions result in an unnecessary load list entry +repeating the MSR_DEBUGCTL reset. + +Restoration of Xen's debugging setting needs to happen from the very first +vmexit. Due to the automatic reset, Xen need take no action in the general +case, and only needs to load a value when debugging is active. + +This could be fixed by using a host MSR load list entry set up during +construct_vmcs(). However, a more efficient option is to use an alternative +block in the VMExit path, keyed on whether hypervisor debugging has been +enabled. + +In order to set this up, drop the per cpu ler_msr variable (as there is no +point having it per cpu when it will be the same everywhere), and use a single +read_mostly variable instead. Split calc_ler_msr() out of percpu_traps_init() +for clarity. + +Finally, clean up do_debug(). Reinstate LBR early to help catch cascade +errors, which allows for the removal of the out label. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Reviewed-by: Kevin Tian +(cherry picked from commit 730dc8d2c9e1b6402e66973cf99a7c56bc78be4c) +--- + xen/arch/x86/hvm/vmx/entry.S | 9 +++++ + xen/arch/x86/hvm/vmx/vmx.c | 3 +- + xen/arch/x86/traps.c | 64 +++++++++++++++---------------- + xen/arch/x86/x86_64/traps.c | 7 ++-- + xen/include/asm-x86/cpufeature.h | 1 + + xen/include/asm-x86/cpufeatures.h | 1 + + xen/include/asm-x86/msr.h | 2 +- + 7 files changed, 47 insertions(+), 40 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index aa2f103895..afd552f2b9 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -41,6 +41,15 @@ ENTRY(vmx_asm_vmexit_handler) + SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + ++ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. 
*/ ++ .macro restore_lbr ++ mov $IA32_DEBUGCTLMSR_LBR, %eax ++ mov $MSR_IA32_DEBUGCTLMSR, %ecx ++ xor %edx, %edx ++ wrmsr ++ .endm ++ ALTERNATIVE "", restore_lbr, X86_FEATURE_XEN_LBR ++ + mov %rsp,%rdi + call vmx_vmexit_handler + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 7189820bfc..bb164359bb 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -3124,8 +3124,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + } + } + +- if ( (rc < 0) || +- (msr_content && (vmx_add_host_load_msr(msr) < 0)) ) ++ if ( rc < 0 ) + hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC); + else + __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 9f045a2045..789d7ff8cd 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi); + DEFINE_PER_CPU(uint64_t, efer); + static DEFINE_PER_CPU(unsigned long, last_extable_addr); + +-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr); +- + DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table); + DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table); + +@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines); + static bool opt_ler; + boolean_param("ler", opt_ler); + ++/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */ ++unsigned int __read_mostly ler_msr; ++ + #define stack_words_per_line 4 + #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) + +@@ -1778,17 +1779,6 @@ void do_device_not_available(struct cpu_user_regs *regs) + return; + } + +-static void ler_enable(void) +-{ +- u64 debugctl; +- +- if ( !this_cpu(ler_msr) ) +- return; +- +- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR); +-} +- + void do_debug(struct cpu_user_regs *regs) + { + unsigned long dr6; +@@ -1821,6 +1811,10 @@ void do_debug(struct cpu_user_regs *regs) + */ + write_debugreg(6, X86_DR6_DEFAULT); + ++ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. 
*/ ++ if ( cpu_has_xen_lbr ) ++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR); ++ + if ( !guest_mode(regs) ) + { + /* +@@ -1838,7 +1832,7 @@ void do_debug(struct cpu_user_regs *regs) + { + if ( regs->rip == (unsigned long)sysenter_eflags_saved ) + regs->eflags &= ~X86_EFLAGS_TF; +- goto out; ++ return; + } + if ( !debugger_trap_fatal(TRAP_debug, regs) ) + { +@@ -1895,20 +1889,14 @@ void do_debug(struct cpu_user_regs *regs) + regs->cs, _p(regs->rip), _p(regs->rip), + regs->ss, _p(regs->rsp), dr6); + +- goto out; ++ return; + } + + /* Save debug status register where guest OS can peek at it */ + v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); + v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); + +- ler_enable(); + pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); +- return; +- +- out: +- ler_enable(); +- return; + } + + static void __init noinline __set_intr_gate(unsigned int n, +@@ -1952,38 +1940,46 @@ void load_TR(void) + : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" ); + } + +-void percpu_traps_init(void) ++static unsigned int calc_ler_msr(void) + { +- subarch_percpu_traps_init(); +- +- if ( !opt_ler ) +- return; +- + switch ( boot_cpu_data.x86_vendor ) + { + case X86_VENDOR_INTEL: + switch ( boot_cpu_data.x86 ) + { + case 6: +- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; +- break; ++ return MSR_IA32_LASTINTFROMIP; ++ + case 15: +- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP; +- break; ++ return MSR_P4_LER_FROM_LIP; + } + break; ++ + case X86_VENDOR_AMD: + switch ( boot_cpu_data.x86 ) + { + case 6: + case 0xf ... 0x17: +- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; +- break; ++ return MSR_IA32_LASTINTFROMIP; + } + break; + } + +- ler_enable(); ++ return 0; ++} ++ ++void percpu_traps_init(void) ++{ ++ subarch_percpu_traps_init(); ++ ++ if ( !opt_ler ) ++ return; ++ ++ if ( !ler_msr && (ler_msr = calc_ler_msr()) ) ++ setup_force_cpu_cap(X86_FEATURE_XEN_LBR); ++ ++ if ( cpu_has_xen_lbr ) ++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR); + } + + void __init init_idt_traps(void) +diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c +index f7f6928d70..b0401850ef 100644 +--- a/xen/arch/x86/x86_64/traps.c ++++ b/xen/arch/x86/x86_64/traps.c +@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs) + printk("CPU: %d\n", smp_processor_id()); + _show_registers(&fault_regs, fault_crs, context, v); + +- if ( this_cpu(ler_msr) && !guest_mode(regs) ) ++ if ( ler_msr && !guest_mode(regs) ) + { + u64 from, to; +- rdmsrl(this_cpu(ler_msr), from); +- rdmsrl(this_cpu(ler_msr) + 1, to); ++ ++ rdmsrl(ler_msr, from); ++ rdmsrl(ler_msr + 1, to); + printk("ler: %016lx -> %016lx\n", from, to); + } + } +diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h +index 2cf8f7ea2a..b237da165c 100644 +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -113,6 +113,7 @@ + #define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF) + #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH) + #define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI) ++#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR) + + enum _cache_type { + CACHE_TYPE_NULL = 0, +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index b90aa2d046..8e5cc53dde 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -32,3 +32,4 @@ XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for + XEN_CPUFEATURE(SC_RSB_HVM, 
(FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */ + XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */ + XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ ++XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */ +diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h +index f14f265aa5..afbeb7f155 100644 +--- a/xen/include/asm-x86/msr.h ++++ b/xen/include/asm-x86/msr.h +@@ -241,7 +241,7 @@ static inline void write_efer(uint64_t val) + wrmsrl(MSR_EFER, val); + } + +-DECLARE_PER_CPU(u32, ler_msr); ++extern unsigned int ler_msr; + + DECLARE_PER_CPU(uint32_t, tsc_aux); + +-- +2.17.1 + + +From 61cc8769a917c646b9bc99ee8adbea602f8d50d2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 28 May 2018 15:02:34 +0100 +Subject: [PATCH 21/42] x86/vmx: Defer vmx_vmcs_exit() as long as possible in + construct_vmcs() + +paging_update_paging_modes() and vmx_vlapic_msr_changed() both operate on the +VMCS being constructed. Avoid dropping and re-acquiring the reference +multiple times. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Acked-by: Kevin Tian +(cherry picked from commit f30e3cf34042846e391e3f8361fc6a76d181a7ee) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 258fc08f72..15d63663e5 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -996,6 +996,7 @@ static int construct_vmcs(struct vcpu *v) + struct domain *d = v->domain; + u32 vmexit_ctl = vmx_vmexit_control; + u32 vmentry_ctl = vmx_vmentry_control; ++ int rc = 0; + + vmx_vmcs_enter(v); + +@@ -1083,8 +1084,8 @@ static int construct_vmcs(struct vcpu *v) + + if ( msr_bitmap == NULL ) + { +- vmx_vmcs_exit(v); +- return -ENOMEM; ++ rc = -ENOMEM; ++ goto out; + } + + memset(msr_bitmap, ~0, PAGE_SIZE); +@@ -1268,14 +1269,15 @@ static int construct_vmcs(struct vcpu *v) + if ( cpu_has_vmx_tsc_scaling ) + __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio); + +- vmx_vmcs_exit(v); +- + /* will update HOST & GUEST_CR3 as reqd */ + paging_update_paging_modes(v); + + vmx_vlapic_msr_changed(v); + +- return 0; ++ out: ++ vmx_vmcs_exit(v); ++ ++ return rc; + } + + static int vmx_msr_entry_key_cmp(const void *key, const void *elt) +-- +2.17.1 + + +From 935e9c404714f5fa6d31890034a7e2cc11c6e0b9 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 22/42] x86/vmx: API improvements for MSR load/save + infrastructure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Collect together related infrastructure in vmcs.h, rather than having it +spread out. Turn vmx_{read,write}_guest_msr() into static inlines, as they +are simple enough. + +Replace 'int type' with 'enum vmx_msr_list_type', and use switch statements +internally. Later changes are going to introduce a new type. + +Rename the type identifiers for consistency with the other VMX_MSR_* +constants. + +No functional change. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Acked-by: Kevin Tian +(cherry picked from commit f54b63e8617ada823be43d60467a43c8224b7909) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 93 +++++++++++++----------------- + xen/arch/x86/hvm/vmx/vmx.c | 8 +-- + xen/include/asm-x86/hvm/vmx/vmcs.h | 62 +++++++++++++++----- + 3 files changed, 91 insertions(+), 72 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 15d63663e5..6bc6597242 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1293,22 +1293,26 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt) + return 0; + } + +-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type) ++struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) + { + struct vcpu *curr = current; + unsigned int msr_count; +- struct vmx_msr_entry *msr_area; ++ struct vmx_msr_entry *msr_area = NULL; + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { +- msr_count = curr->arch.hvm_vmx.msr_count; +- msr_area = curr->arch.hvm_vmx.msr_area; +- } +- else +- { +- ASSERT(type == VMX_HOST_MSR); ++ case VMX_MSR_HOST: + msr_count = curr->arch.hvm_vmx.host_msr_count; + msr_area = curr->arch.hvm_vmx.host_msr_area; ++ break; ++ ++ case VMX_MSR_GUEST: ++ msr_count = curr->arch.hvm_vmx.msr_count; ++ msr_area = curr->arch.hvm_vmx.msr_area; ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); + } + + if ( msr_area == NULL ) +@@ -1318,48 +1322,27 @@ struct vmx_msr_entry *vmx_find_msr(u32 msr, int type) + vmx_msr_entry_key_cmp); + } + +-int vmx_read_guest_msr(u32 msr, u64 *val) +-{ +- struct vmx_msr_entry *ent; +- +- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL ) +- { +- *val = ent->data; +- return 0; +- } +- +- return -ESRCH; +-} +- +-int vmx_write_guest_msr(u32 msr, u64 val) +-{ +- struct vmx_msr_entry *ent; +- +- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL ) +- { +- ent->data = val; +- return 0; +- } +- +- return -ESRCH; +-} +- +-int vmx_add_msr(u32 msr, int type) ++int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + { + struct vcpu *curr = current; + unsigned int idx, *msr_count; + struct vmx_msr_entry **msr_area, *msr_area_elem; + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { +- msr_count = &curr->arch.hvm_vmx.msr_count; +- msr_area = &curr->arch.hvm_vmx.msr_area; +- } +- else +- { +- ASSERT(type == VMX_HOST_MSR); ++ case VMX_MSR_HOST: + msr_count = &curr->arch.hvm_vmx.host_msr_count; + msr_area = &curr->arch.hvm_vmx.host_msr_area; ++ break; ++ ++ case VMX_MSR_GUEST: ++ msr_count = &curr->arch.hvm_vmx.msr_count; ++ msr_area = &curr->arch.hvm_vmx.msr_area; ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); ++ return -EINVAL; + } + + if ( *msr_area == NULL ) +@@ -1367,13 +1350,17 @@ int vmx_add_msr(u32 msr, int type) + if ( (*msr_area = alloc_xenheap_page()) == NULL ) + return -ENOMEM; + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { ++ case VMX_MSR_HOST: ++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); ++ break; ++ ++ case VMX_MSR_GUEST: + __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area)); + __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); ++ break; + } +- else +- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); + } + + for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ ) +@@ -1392,16 +1379,18 @@ int vmx_add_msr(u32 msr, int type) + + ++*msr_count; + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { ++ case VMX_MSR_HOST: ++ rdmsrl(msr, msr_area_elem->data); 
++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count); ++ break; ++ ++ case VMX_MSR_GUEST: + msr_area_elem->data = 0; + __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count); + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count); +- } +- else +- { +- rdmsrl(msr, msr_area_elem->data); +- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count); ++ break; + } + + return 0; +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index bb164359bb..d4ebae8945 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void) + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; + struct vmx_msr_entry *msr; + +- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL ) ++ if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL ) + { + /* + * Sign extend into bits 61:62 while preserving bit 63 +@@ -4179,7 +4179,7 @@ static void lbr_tsx_fixup(void) + msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL ) ++ if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL ) + msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +@@ -4207,8 +4207,8 @@ static void bdw_erratum_bdf14_fixup(void) + * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by + * sign-extending into bits 48:63. + */ +- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR); +- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR); ++ sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST); ++ sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST); + } + + static void lbr_fixup(void) +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index 06c3179cec..20882d13e0 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -514,9 +514,6 @@ enum vmcs_field { + + #define VMCS_VPID_WIDTH 16 + +-#define VMX_GUEST_MSR 0 +-#define VMX_HOST_MSR 1 +- + /* VM Instruction error numbers */ + enum vmx_insn_errno + { +@@ -534,6 +531,52 @@ enum vmx_insn_errno + VMX_INSN_FAIL_INVALID = ~0, + }; + ++/* MSR load/save list infrastructure. */ ++enum vmx_msr_list_type { ++ VMX_MSR_HOST, /* MSRs loaded on VMExit. */ ++ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */ ++}; ++ ++int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type); ++ ++static inline int vmx_add_host_load_msr(uint32_t msr) ++{ ++ return vmx_add_msr(msr, VMX_MSR_HOST); ++} ++ ++static inline int vmx_add_guest_msr(uint32_t msr) ++{ ++ return vmx_add_msr(msr, VMX_MSR_GUEST); ++} ++ ++struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type); ++ ++static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val) ++{ ++ const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST); ++ ++ if ( !ent ) ++ return -ESRCH; ++ ++ *val = ent->data; ++ ++ return 0; ++} ++ ++static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val) ++{ ++ struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST); ++ ++ if ( !ent ) ++ return -ESRCH; ++ ++ ent->data = val; ++ ++ return 0; ++} ++ ++ ++/* MSR intercept bitmap infrastructure. 
*/ + enum vmx_msr_intercept_type { + VMX_MSR_R = 1, + VMX_MSR_W = 2, +@@ -544,10 +587,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr, + enum vmx_msr_intercept_type type); + void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr, + enum vmx_msr_intercept_type type); +-int vmx_read_guest_msr(u32 msr, u64 *val); +-int vmx_write_guest_msr(u32 msr, u64 val); +-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type); +-int vmx_add_msr(u32 msr, int type); + void vmx_vmcs_switch(paddr_t from, paddr_t to); + void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector); + void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector); +@@ -562,15 +601,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val); + enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v, + u32 vmcs_encoding, u64 val); + +-static inline int vmx_add_guest_msr(u32 msr) +-{ +- return vmx_add_msr(msr, VMX_GUEST_MSR); +-} +-static inline int vmx_add_host_load_msr(u32 msr) +-{ +- return vmx_add_msr(msr, VMX_HOST_MSR); +-} +- + DECLARE_PER_CPU(bool_t, vmxon); + + bool_t vmx_vcpu_pml_enabled(const struct vcpu *v); +-- +2.17.1 + + +From 52b8f9ae22a5daa1f2cad0aa5065b72b48c33ce4 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 23/42] x86/vmx: Internal cleanup for MSR load/save + infrastructure + + * Use an arch_vmx_struct local variable to reduce later code volume. + * Use start/total instead of msr_area/msr_count. This is in preparation for + more finegrained handling with later changes. + * Use ent/end pointers (again for preparation), and to make the vmx_add_msr() + logic easier to follow. + * Make the memory allocation block of vmx_add_msr() unlikely, and calculate + virt_to_maddr() just once. + +No practical change to functionality. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Kevin Tian +(cherry picked from commit 94fda356fcdcc847662a4c9f6cc63511f25c1247) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 75 ++++++++++++++++++++----------------- + 1 file changed, 40 insertions(+), 35 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 6bc6597242..a6ddba3132 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1296,48 +1296,49 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt) + struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) + { + struct vcpu *curr = current; +- unsigned int msr_count; +- struct vmx_msr_entry *msr_area = NULL; ++ struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx; ++ struct vmx_msr_entry *start = NULL; ++ unsigned int total; + + switch ( type ) + { + case VMX_MSR_HOST: +- msr_count = curr->arch.hvm_vmx.host_msr_count; +- msr_area = curr->arch.hvm_vmx.host_msr_area; ++ start = vmx->host_msr_area; ++ total = vmx->host_msr_count; + break; + + case VMX_MSR_GUEST: +- msr_count = curr->arch.hvm_vmx.msr_count; +- msr_area = curr->arch.hvm_vmx.msr_area; ++ start = vmx->msr_area; ++ total = vmx->msr_count; + break; + + default: + ASSERT_UNREACHABLE(); + } + +- if ( msr_area == NULL ) ++ if ( !start ) + return NULL; + +- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry), +- vmx_msr_entry_key_cmp); ++ return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp); + } + + int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + { + struct vcpu *curr = current; +- unsigned int idx, *msr_count; +- struct vmx_msr_entry **msr_area, *msr_area_elem; ++ struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx; ++ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end; ++ unsigned int total; + + switch ( type ) + { + case VMX_MSR_HOST: +- msr_count = &curr->arch.hvm_vmx.host_msr_count; +- msr_area = &curr->arch.hvm_vmx.host_msr_area; ++ ptr = &vmx->host_msr_area; ++ total = vmx->host_msr_count; + break; + + case VMX_MSR_GUEST: +- msr_count = &curr->arch.hvm_vmx.msr_count; +- msr_area = &curr->arch.hvm_vmx.msr_area; ++ ptr = &vmx->msr_area; ++ total = vmx->msr_count; + break; + + default: +@@ -1345,51 +1346,55 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + return -EINVAL; + } + +- if ( *msr_area == NULL ) ++ /* Allocate memory on first use. 
*/ ++ if ( unlikely(!*ptr) ) + { +- if ( (*msr_area = alloc_xenheap_page()) == NULL ) ++ paddr_t addr; ++ ++ if ( (*ptr = alloc_xenheap_page()) == NULL ) + return -ENOMEM; + ++ addr = virt_to_maddr(*ptr); ++ + switch ( type ) + { + case VMX_MSR_HOST: +- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); ++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr); + break; + + case VMX_MSR_GUEST: +- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area)); +- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); ++ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr); ++ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr); + break; + } + } + +- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ ) +- if ( (*msr_area)[idx].index == msr ) ++ start = *ptr; ++ end = start + total; ++ ++ for ( ent = start; ent < end && ent->index <= msr; ++ent ) ++ if ( ent->index == msr ) + return 0; + +- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) ++ if ( total == (PAGE_SIZE / sizeof(*ent)) ) + return -ENOSPC; + +- memmove(*msr_area + idx + 1, *msr_area + idx, +- sizeof(*msr_area_elem) * (*msr_count - idx)); +- +- msr_area_elem = *msr_area + idx; +- msr_area_elem->index = msr; +- msr_area_elem->mbz = 0; ++ memmove(ent + 1, ent, sizeof(*ent) * (end - ent)); + +- ++*msr_count; ++ ent->index = msr; ++ ent->mbz = 0; + + switch ( type ) + { + case VMX_MSR_HOST: +- rdmsrl(msr, msr_area_elem->data); +- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count); ++ rdmsrl(msr, ent->data); ++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count); + break; + + case VMX_MSR_GUEST: +- msr_area_elem->data = 0; +- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count); +- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count); ++ ent->data = 0; ++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count); ++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count); + break; + } + +-- +2.17.1 + + +From b52017c904ae770ab86a62bf3219ee21d23bb55b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 24/42] x86/vmx: Factor locate_msr_entry() out of + vmx_find_msr() and vmx_add_msr() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Instead of having multiple algorithms searching the MSR lists, implement a +single one. It has the semantics required by vmx_add_msr(), to identify the +position in which an MSR should live, if it isn't already present. + +There will be a marginal improvement for vmx_find_msr() by avoiding the +function pointer calls to vmx_msr_entry_key_cmp(), and a major improvement for +vmx_add_msr() by using a binary search instead of a linear search. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Acked-by: Kevin Tian +(cherry picked from commit 4d94828cf11104256dccea1fa7762f00575dfaa0) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 41 +++++++++++++++++++++++++------------ + 1 file changed, 28 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index a6ddba3132..c75b0ee5c3 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1280,24 +1280,36 @@ static int construct_vmcs(struct vcpu *v) + return rc; + } + +-static int vmx_msr_entry_key_cmp(const void *key, const void *elt) ++/* ++ * Search an MSR list looking for an MSR entry, or the slot in which it should ++ * live (to keep the data sorted) if an entry is not found. ++ * ++ * The return pointer is guaranteed to be bounded by start and end. 
However, ++ * it may point at end, and may be invalid for the caller to dereference. ++ */ ++static struct vmx_msr_entry *locate_msr_entry( ++ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr) + { +- const u32 *msr = key; +- const struct vmx_msr_entry *entry = elt; ++ while ( start < end ) ++ { ++ struct vmx_msr_entry *mid = start + (end - start) / 2; + +- if ( *msr > entry->index ) +- return 1; +- if ( *msr < entry->index ) +- return -1; ++ if ( msr < mid->index ) ++ end = mid; ++ else if ( msr > mid->index ) ++ start = mid + 1; ++ else ++ return mid; ++ } + +- return 0; ++ return start; + } + + struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) + { + struct vcpu *curr = current; + struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx; +- struct vmx_msr_entry *start = NULL; ++ struct vmx_msr_entry *start = NULL, *ent, *end; + unsigned int total; + + switch ( type ) +@@ -1319,7 +1331,10 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) + if ( !start ) + return NULL; + +- return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp); ++ end = start + total; ++ ent = locate_msr_entry(start, end, msr); ++ ++ return ((ent < end) && (ent->index == msr)) ? ent : NULL; + } + + int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) +@@ -1371,10 +1386,10 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + + start = *ptr; + end = start + total; ++ ent = locate_msr_entry(start, end, msr); + +- for ( ent = start; ent < end && ent->index <= msr; ++ent ) +- if ( ent->index == msr ) +- return 0; ++ if ( (ent < end) && (ent->index == msr) ) ++ return 0; + + if ( total == (PAGE_SIZE / sizeof(*ent)) ) + return -ENOSPC; +-- +2.17.1 + + +From 218d403ad944f47548752d4a60e8f77e5f8e1950 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 25/42] x86/vmx: Support remote access to the MSR lists + +At the moment, all modifications of the MSR lists are in current context. +However, future changes may need to put MSR_EFER into the lists from domctl +hypercall context. + +Plumb a struct vcpu parameter down through the infrastructure, and use +vmx_vmcs_{enter,exit}() for safe access to the VMCS in vmx_add_msr(). Use +assertions to ensure that access is either in current context, or while the +vcpu is paused. + +Note these expectations beside the fields in arch_vmx_struct, and reorder the +fields to avoid unnecessary padding. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Kevin Tian +Reviewed-by: Jan Beulich +(cherry picked from commit 80599f0b770199116aa753bfdfac9bfe2e8ea86a) +--- + xen/arch/x86/cpu/vpmu_intel.c | 14 +++++------ + xen/arch/x86/hvm/vmx/vmcs.c | 40 ++++++++++++++++++++++-------- + xen/arch/x86/hvm/vmx/vmx.c | 22 ++++++++-------- + xen/include/asm-x86/hvm/vmx/vmcs.h | 34 ++++++++++++++++--------- + xen/include/xen/sched.h | 2 +- + 5 files changed, 72 insertions(+), 40 deletions(-) + +diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c +index 207e2e712c..c499e69f2f 100644 +--- a/xen/arch/x86/cpu/vpmu_intel.c ++++ b/xen/arch/x86/cpu/vpmu_intel.c +@@ -455,12 +455,12 @@ static int core2_vpmu_alloc_resource(struct vcpu *v) + if ( is_hvm_vcpu(v) ) + { + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) + goto out_err; + +- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) + goto out_err; +- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0); ++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) + +@@ -613,7 +613,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + return -EINVAL; + + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, + &core2_vpmu_cxt->global_ctrl); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl); +@@ -682,7 +682,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + return -EINVAL; + + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, + &core2_vpmu_cxt->global_ctrl); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl); +@@ -701,7 +701,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + else + { + if ( is_hvm_vcpu(v) ) +- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); ++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + else + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + } +@@ -735,7 +735,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content); + break; +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index c75b0ee5c3..e86f292fbc 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1305,13 +1305,15 @@ static struct vmx_msr_entry *locate_msr_entry( + return start; + } + +-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) ++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, ++ enum vmx_msr_list_type type) + { +- struct vcpu *curr = current; +- struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx; ++ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry *start = NULL, *ent, *end; + unsigned int total; + ++ ASSERT(v == current || !vcpu_runnable(v)); ++ + switch ( type ) + { + case VMX_MSR_HOST: +@@ -1337,12 +1339,14 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type) + return ((ent < end) && (ent->index == msr)) ? 
ent : NULL; + } + +-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) ++int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type) + { +- struct vcpu *curr = current; +- struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx; ++ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry **ptr, *start = NULL, *ent, *end; + unsigned int total; ++ int rc; ++ ++ ASSERT(v == current || !vcpu_runnable(v)); + + switch ( type ) + { +@@ -1361,13 +1365,18 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + return -EINVAL; + } + ++ vmx_vmcs_enter(v); ++ + /* Allocate memory on first use. */ + if ( unlikely(!*ptr) ) + { + paddr_t addr; + + if ( (*ptr = alloc_xenheap_page()) == NULL ) +- return -ENOMEM; ++ { ++ rc = -ENOMEM; ++ goto out; ++ } + + addr = virt_to_maddr(*ptr); + +@@ -1389,10 +1398,16 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + ent = locate_msr_entry(start, end, msr); + + if ( (ent < end) && (ent->index == msr) ) +- return 0; ++ { ++ rc = 0; ++ goto out; ++ } + + if ( total == (PAGE_SIZE / sizeof(*ent)) ) +- return -ENOSPC; ++ { ++ rc = -ENOSPC; ++ goto out; ++ } + + memmove(ent + 1, ent, sizeof(*ent) * (end - ent)); + +@@ -1413,7 +1428,12 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type) + break; + } + +- return 0; ++ rc = 0; ++ ++ out: ++ vmx_vmcs_exit(v); ++ ++ return rc; + } + + void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector) +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index d4ebae8945..95162bf187 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -2822,7 +2822,7 @@ static int is_last_branch_msr(u32 ecx) + + static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + { +- const struct vcpu *curr = current; ++ struct vcpu *curr = current; + + HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr); + +@@ -2901,7 +2901,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + if ( passive_domain_do_rdmsr(msr, msr_content) ) + goto done; + +- if ( vmx_read_guest_msr(msr, msr_content) == 0 ) ++ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 ) + break; + + if ( is_last_branch_msr(msr) ) +@@ -3113,7 +3113,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + + for ( ; (rc == 0) && lbr->count; lbr++ ) + for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) +- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 ) ++ if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 ) + { + vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW); + if ( lbr_tsx_fixup_needed ) +@@ -3153,7 +3153,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + if ( wrmsr_viridian_regs(msr, msr_content) ) + break; + +- if ( vmx_write_guest_msr(msr, msr_content) == 0 || ++ if ( vmx_write_guest_msr(v, msr, msr_content) == 0 || + is_last_branch_msr(msr) ) + break; + +@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void) + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; + struct vmx_msr_entry *msr; + +- if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL ) ++ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL ) + { + /* + * Sign extend into bits 61:62 while preserving bit 63 +@@ -4179,15 +4179,15 @@ static void lbr_tsx_fixup(void) + msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL ) ++ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL ) + msr->data |= 
((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +-static void sign_extend_msr(u32 msr, int type) ++static void sign_extend_msr(struct vcpu *v, u32 msr, int type) + { + struct vmx_msr_entry *entry; + +- if ( (entry = vmx_find_msr(msr, type)) != NULL ) ++ if ( (entry = vmx_find_msr(v, msr, type)) != NULL ) + { + if ( entry->data & VADDR_TOP_BIT ) + entry->data |= CANONICAL_MASK; +@@ -4198,6 +4198,8 @@ static void sign_extend_msr(u32 msr, int type) + + static void bdw_erratum_bdf14_fixup(void) + { ++ struct vcpu *curr = current; ++ + /* + * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has + * been observed to have the top three bits corrupted as though the +@@ -4207,8 +4209,8 @@ static void bdw_erratum_bdf14_fixup(void) + * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by + * sign-extending into bits 48:63. + */ +- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST); +- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST); ++ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST); ++ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST); + } + + static void lbr_fixup(void) +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index 20882d13e0..62afebec11 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -130,10 +130,17 @@ struct arch_vmx_struct { + uint64_t sfmask; + + struct vmx_msr_bitmap *msr_bitmap; +- unsigned int msr_count; ++ ++ /* ++ * Most accesses to the MSR host/guest load/save lists are in current ++ * context. However, the data can be modified by toolstack/migration ++ * actions. Remote access is only permitted for paused vcpus, and is ++ * protected under the domctl lock. ++ */ + struct vmx_msr_entry *msr_area; +- unsigned int host_msr_count; + struct vmx_msr_entry *host_msr_area; ++ unsigned int msr_count; ++ unsigned int host_msr_count; + + unsigned long eoi_exitmap_changed; + DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS); +@@ -537,23 +544,25 @@ enum vmx_msr_list_type { + VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. 
*/ + }; + +-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type); ++int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type); + +-static inline int vmx_add_host_load_msr(uint32_t msr) ++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr) + { +- return vmx_add_msr(msr, VMX_MSR_HOST); ++ return vmx_add_msr(v, msr, VMX_MSR_GUEST); + } + +-static inline int vmx_add_guest_msr(uint32_t msr) ++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr) + { +- return vmx_add_msr(msr, VMX_MSR_GUEST); ++ return vmx_add_msr(v, msr, VMX_MSR_HOST); + } + +-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type); ++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, ++ enum vmx_msr_list_type type); + +-static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val) ++static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr, ++ uint64_t *val) + { +- const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST); ++ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST); + + if ( !ent ) + return -ESRCH; +@@ -563,9 +572,10 @@ static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val) + return 0; + } + +-static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val) ++static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr, ++ uint64_t val) + { +- struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST); ++ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST); + + if ( !ent ) + return -ESRCH; +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 99d2af2e1f..e79d5a36ca 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -788,7 +788,7 @@ static inline struct domain *next_domain_in_cpupool( + #define _VPF_parked 8 + #define VPF_parked (1UL<<_VPF_parked) + +-static inline int vcpu_runnable(struct vcpu *v) ++static inline bool vcpu_runnable(const struct vcpu *v) + { + return !(v->pause_flags | + atomic_read(&v->pause_count) | +-- +2.17.1 + + +From cfdd4e846a77ca5510b6c35adeec55014a73efb9 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 26/42] x86/vmx: Improvements to LBR MSR handling + +The main purpose of this patch is to only ever insert the LBR MSRs into the +guest load/save list once, as a future patch wants to change the behaviour of +vmx_add_guest_msr(). + +The repeated processing of lbr_info and the guests MSR load/save list is +redundant, and a guest using LBR itself will have to re-enable +MSR_DEBUGCTL.LBR in its #DB handler, meaning that Xen will repeat this +redundant processing every time the guest gets a debug exception. + +Rename lbr_fixup_enabled to lbr_flags to be a little more generic, and use one +bit to indicate that the MSRs have been inserted into the load/save list. +Shorten the existing FIXUP* identifiers to reduce code volume. + +Furthermore, handing the guest #MC on an error isn't a legitimate action. Two +of the three failure cases are definitely hypervisor bugs, and the third is a +boundary case which shouldn't occur in practice. The guest also won't execute +correctly, so handle errors by cleanly crashing the guest. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Kevin Tian +Reviewed-by: Jan Beulich +(cherry picked from commit be73a842e642772d7372004c9c105de35b771020) +--- + xen/arch/x86/hvm/vmx/vmx.c | 81 +++++++++++++++++++++--------- + xen/include/asm-x86/hvm/vmx/vmcs.h | 2 +- + 2 files changed, 59 insertions(+), 24 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 95162bf187..5f01652d48 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -2758,8 +2758,10 @@ enum + + #define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60)) + +-#define FIXUP_LBR_TSX (1u << 0) +-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1) ++#define LBR_MSRS_INSERTED (1u << 0) ++#define LBR_FIXUP_TSX (1u << 1) ++#define LBR_FIXUP_BDF14 (1u << 2) ++#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14) + + static bool __read_mostly lbr_tsx_fixup_needed; + static bool __read_mostly bdw_erratum_bdf14_fixup_needed; +@@ -3094,7 +3096,6 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + break; + + case MSR_IA32_DEBUGCTLMSR: { +- int i, rc = 0; + uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF; + + if ( boot_cpu_has(X86_FEATURE_RTM) ) +@@ -3105,30 +3106,64 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + if ( vpmu_do_wrmsr(msr, msr_content, supported) ) + break; + } +- if ( msr_content & IA32_DEBUGCTLMSR_LBR ) ++ ++ /* ++ * When a guest first enables LBR, arrange to save and restore the LBR ++ * MSRs and allow the guest direct access. ++ * ++ * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have ++ * existed, and there is no architectural way to hide the feature, or ++ * fail the attempt to enable LBR. ++ * ++ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save ++ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating ++ * the load/save list is simply unlucky (and shouldn't occur with ++ * sensible management by the toolstack). ++ * ++ * Either way, there is nothing we can do right now to recover, and ++ * the guest won't execute correctly either. Simply crash the domain ++ * to make the failure obvious. 
++ */ ++ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) && ++ (msr_content & IA32_DEBUGCTLMSR_LBR) ) + { + const struct lbr_info *lbr = last_branch_msr_get(); +- if ( lbr == NULL ) +- break; + +- for ( ; (rc == 0) && lbr->count; lbr++ ) +- for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) +- if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 ) ++ if ( unlikely(!lbr) ) ++ { ++ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n"); ++ domain_crash(v->domain); ++ return X86EMUL_OKAY; ++ } ++ ++ for ( ; lbr->count; lbr++ ) ++ { ++ unsigned int i; ++ ++ for ( i = 0; i < lbr->count; i++ ) ++ { ++ int rc = vmx_add_guest_msr(v, lbr->base + i); ++ ++ if ( unlikely(rc) ) + { +- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW); +- if ( lbr_tsx_fixup_needed ) +- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX; +- if ( bdw_erratum_bdf14_fixup_needed ) +- v->arch.hvm_vmx.lbr_fixup_enabled |= +- FIXUP_BDW_ERRATUM_BDF14; ++ gprintk(XENLOG_ERR, ++ "Guest load/save list error %d\n", rc); ++ domain_crash(v->domain); ++ return X86EMUL_OKAY; + } +- } + +- if ( rc < 0 ) +- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC); +- else +- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); ++ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW); ++ } ++ } ++ ++ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED; ++ if ( lbr_tsx_fixup_needed ) ++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX; ++ if ( bdw_erratum_bdf14_fixup_needed ) ++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14; ++ } + ++ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); + break; + } + case MSR_IA32_FEATURE_CONTROL: +@@ -4217,9 +4252,9 @@ static void lbr_fixup(void) + { + struct vcpu *curr = current; + +- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX ) ++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX ) + lbr_tsx_fixup(); +- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 ) ++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 ) + bdw_erratum_bdf14_fixup(); + } + +@@ -4287,7 +4322,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs) + } + + out: +- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) ) ++ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) ) + lbr_fixup(); + + HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index 62afebec11..2c9e291bee 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -156,7 +156,7 @@ struct arch_vmx_struct { + /* Are we emulating rather than VMENTERing? */ + uint8_t vmx_emulate; + +- uint8_t lbr_fixup_enabled; ++ uint8_t lbr_flags; + + /* Bitmask of segments that we can't safely use in virtual 8086 mode */ + uint16_t vm86_segment_mask; +-- +2.17.1 + + +From 8b35b978a273a153ceadccd9c02d433f8be1c9bd Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 27/42] x86/vmx: Pass an MSR value into vmx_msr_add() + +The main purpose of this change is to allow us to set a specific MSR value, +without needing to know whether there is already a load/save list slot for it. + +Previously, callers wanting this property needed to call both vmx_add_*_msr() +and vmx_write_*_msr() to cover both cases, and there are no callers which want +the old behaviour of being a no-op if an entry already existed for the MSR. + +As a result of this API improvement, the default value for guest MSRs need not +be 0, and the default for host MSRs need not be passed via hardware register. 
+In practice, this cleans up the VPMU allocation logic, and avoids an MSR read +as part of vcpu construction. + +Signed-off-by: Andrew Cooper +Acked-by: Kevin Tian +Reviewed-by: Jan Beulich +(cherry picked from commit ee7689b94ac7094b975ab4a023cfeae209da0a36) +--- + xen/arch/x86/cpu/vpmu_intel.c | 6 ++---- + xen/arch/x86/hvm/vmx/vmcs.c | 14 +++++++------- + xen/arch/x86/hvm/vmx/vmx.c | 2 +- + xen/include/asm-x86/hvm/vmx/vmcs.h | 20 ++++++++++++++------ + 4 files changed, 24 insertions(+), 18 deletions(-) + +diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c +index c499e69f2f..1fc79c9ff4 100644 +--- a/xen/arch/x86/cpu/vpmu_intel.c ++++ b/xen/arch/x86/cpu/vpmu_intel.c +@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v) + + if ( is_hvm_vcpu(v) ) + { +- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +- if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) ) + goto out_err; + +- if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) ) + goto out_err; +- vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index e86f292fbc..af422b3f92 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1339,7 +1339,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + return ((ent < end) && (ent->index == msr)) ? ent : NULL; + } + +-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type) ++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, ++ enum vmx_msr_list_type type) + { + struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry **ptr, *start = NULL, *ent, *end; +@@ -1398,11 +1399,9 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type) + ent = locate_msr_entry(start, end, msr); + + if ( (ent < end) && (ent->index == msr) ) +- { +- rc = 0; +- goto out; +- } ++ goto found; + ++ /* If there isn't an existing entry for msr, insert room for one. */ + if ( total == (PAGE_SIZE / sizeof(*ent)) ) + { + rc = -ENOSPC; +@@ -1417,17 +1416,18 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type) + switch ( type ) + { + case VMX_MSR_HOST: +- rdmsrl(msr, ent->data); + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count); + break; + + case VMX_MSR_GUEST: +- ent->data = 0; + __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count); + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count); + break; + } + ++ /* Set the msr's value. */ ++ found: ++ ent->data = val; + rc = 0; + + out: +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 5f01652d48..5745543e49 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -3142,7 +3142,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + + for ( i = 0; i < lbr->count; i++ ) + { +- int rc = vmx_add_guest_msr(v, lbr->base + i); ++ int rc = vmx_add_guest_msr(v, lbr->base + i, 0); + + if ( unlikely(rc) ) + { +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index 2c9e291bee..f94a108ea5 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -544,16 +544,24 @@ enum vmx_msr_list_type { + VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. 
*/ + }; + +-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type); ++/** ++ * Add an MSR to an MSR list (inserting space for the entry if necessary), and ++ * set the MSRs value. ++ * ++ * May fail if unable to allocate memory for the list, or the total number of ++ * entries exceeds the memory allocated. ++ */ ++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, ++ enum vmx_msr_list_type type); + +-static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr) ++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val) + { +- return vmx_add_msr(v, msr, VMX_MSR_GUEST); ++ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST); + } +- +-static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr) ++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr, ++ uint64_t val) + { +- return vmx_add_msr(v, msr, VMX_MSR_HOST); ++ return vmx_add_msr(v, msr, val, VMX_MSR_HOST); + } + + struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, +-- +2.17.1 + + +From 7b420e8a82cc8664e086ed31ec5e80615bd6225f Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 7 May 2018 11:57:00 +0100 +Subject: [PATCH 28/42] x86/vmx: Support load-only guest MSR list entries +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently, the VMX_MSR_GUEST type maintains completely symmetric guest load +and save lists, by pointing VM_EXIT_MSR_STORE_ADDR and VM_ENTRY_MSR_LOAD_ADDR +at the same page, and setting VM_EXIT_MSR_STORE_COUNT and +VM_ENTRY_MSR_LOAD_COUNT to the same value. + +However, for MSRs which we won't let the guest have direct access to, having +hardware save the current value on VMExit is unnecessary overhead. + +To avoid this overhead, we must make the load and save lists asymmetric. By +making the entry load count greater than the exit store count, we can maintain +two adjacent lists of MSRs, the first of which is saved and restored, and the +second of which is only restored on VMEntry. + +For simplicity: + * Both adjacent lists are still sorted by MSR index. + * It undefined behaviour to insert the same MSR into both lists. + * The total size of both lists is still limited at 256 entries (one 4k page). + +Split the current msr_count field into msr_{load,save}_count, and introduce a +new VMX_MSR_GUEST_LOADONLY type, and update vmx_{add,find}_msr() to calculate +which sublist to search, based on type. VMX_MSR_HOST has no logical sublist, +whereas VMX_MSR_GUEST has a sublist between 0 and the save count, while +VMX_MSR_GUEST_LOADONLY has a sublist between the save count and the load +count. + +One subtle point is that inserting an MSR into the load-save list involves +moving the entire load-only list, and updating both counts. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Reviewed-by: Roger Pau Monné +Acked-by: Kevin Tian +(cherry picked from commit 1ac46b55632626aeb935726e1b0a71605ef6763a) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 46 +++++++++++++++++++++++------- + xen/arch/x86/hvm/vmx/vmx.c | 2 +- + xen/include/asm-x86/hvm/vmx/vmcs.h | 7 ++++- + 3 files changed, 43 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index af422b3f92..ca652c49cb 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1310,7 +1310,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + { + const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry *start = NULL, *ent, *end; +- unsigned int total; ++ unsigned int substart, subend, total; + + ASSERT(v == current || !vcpu_runnable(v)); + +@@ -1318,12 +1318,23 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + { + case VMX_MSR_HOST: + start = vmx->host_msr_area; +- total = vmx->host_msr_count; ++ substart = 0; ++ subend = vmx->host_msr_count; ++ total = subend; + break; + + case VMX_MSR_GUEST: + start = vmx->msr_area; +- total = vmx->msr_count; ++ substart = 0; ++ subend = vmx->msr_save_count; ++ total = vmx->msr_load_count; ++ break; ++ ++ case VMX_MSR_GUEST_LOADONLY: ++ start = vmx->msr_area; ++ substart = vmx->msr_save_count; ++ subend = vmx->msr_load_count; ++ total = subend; + break; + + default: +@@ -1334,7 +1345,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + return NULL; + + end = start + total; +- ent = locate_msr_entry(start, end, msr); ++ ent = locate_msr_entry(start + substart, start + subend, msr); + + return ((ent < end) && (ent->index == msr)) ? ent : NULL; + } +@@ -1344,7 +1355,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, + { + struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry **ptr, *start = NULL, *ent, *end; +- unsigned int total; ++ unsigned int substart, subend, total; + int rc; + + ASSERT(v == current || !vcpu_runnable(v)); +@@ -1353,12 +1364,23 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, + { + case VMX_MSR_HOST: + ptr = &vmx->host_msr_area; +- total = vmx->host_msr_count; ++ substart = 0; ++ subend = vmx->host_msr_count; ++ total = subend; + break; + + case VMX_MSR_GUEST: + ptr = &vmx->msr_area; +- total = vmx->msr_count; ++ substart = 0; ++ subend = vmx->msr_save_count; ++ total = vmx->msr_load_count; ++ break; ++ ++ case VMX_MSR_GUEST_LOADONLY: ++ ptr = &vmx->msr_area; ++ substart = vmx->msr_save_count; ++ subend = vmx->msr_load_count; ++ total = subend; + break; + + default: +@@ -1388,6 +1410,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, + break; + + case VMX_MSR_GUEST: ++ case VMX_MSR_GUEST_LOADONLY: + __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr); + __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr); + break; +@@ -1396,7 +1419,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, + + start = *ptr; + end = start + total; +- ent = locate_msr_entry(start, end, msr); ++ ent = locate_msr_entry(start + substart, start + subend, msr); + + if ( (ent < end) && (ent->index == msr) ) + goto found; +@@ -1420,8 +1443,11 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, + break; + + case VMX_MSR_GUEST: +- __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count); +- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count); ++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count); ++ ++ /* Fallthrough */ ++ case 
VMX_MSR_GUEST_LOADONLY: ++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count); + break; + } + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 5745543e49..1e32f61225 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4200,7 +4200,7 @@ out: + static void lbr_tsx_fixup(void) + { + struct vcpu *curr = current; +- unsigned int msr_count = curr->arch.hvm_vmx.msr_count; ++ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count; + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; + struct vmx_msr_entry *msr; + +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index f94a108ea5..57e5098b99 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -139,7 +139,8 @@ struct arch_vmx_struct { + */ + struct vmx_msr_entry *msr_area; + struct vmx_msr_entry *host_msr_area; +- unsigned int msr_count; ++ unsigned int msr_load_count; ++ unsigned int msr_save_count; + unsigned int host_msr_count; + + unsigned long eoi_exitmap_changed; +@@ -542,12 +543,16 @@ enum vmx_insn_errno + enum vmx_msr_list_type { + VMX_MSR_HOST, /* MSRs loaded on VMExit. */ + VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */ ++ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */ + }; + + /** + * Add an MSR to an MSR list (inserting space for the entry if necessary), and + * set the MSRs value. + * ++ * It is undefined behaviour to try and insert the same MSR into both the ++ * GUEST and GUEST_LOADONLY list. ++ * + * May fail if unable to allocate memory for the list, or the total number of + * entries exceeds the memory allocated. + */ +-- +2.17.1 + + +From 1d32c21975097e64a7ecf0932680a3b6d53d00a4 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Thu, 19 Jul 2018 11:54:45 +0200 +Subject: [PATCH 29/42] VMX: fix vmx_{find,del}_msr() build + +Older gcc at -O2 (and perhaps higher) does not recognize that apparently +uninitialized variables aren't really uninitialized. Pull out the +assignments used by two of the three case blocks and make them +initializers of the variables, as I think I had suggested during review. 
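
As an aside (not part of the patch itself): a hypothetical, self-contained illustration of
the pattern being applied here. When the assignment only happens inside some case blocks,
older gcc at -O2 cannot prove that every path sets the variable and warns; moving the
common-case value into the initialiser avoids this without changing behaviour.

    static unsigned int sub_end(unsigned int type,
                                unsigned int save_count, unsigned int host_count)
    {
        unsigned int subend = save_count;   /* common-case initialiser */

        switch ( type )
        {
        case 0:           /* host list is the odd one out */
            subend = host_count;
            break;

        default:          /* guest lists keep the initialiser */
            break;
        }

        return subend;
    }
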
+ +Signed-off-by: Jan Beulich +Reviewed-by: Wei Liu +Acked-by: Kevin Tian +(cherry picked from commit 97cb0516a322ecdf0032fa9d8aa1525c03d7772f) +--- + xen/arch/x86/hvm/vmx/vmcs.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index ca652c49cb..30a33dd0bd 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1310,7 +1310,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + { + const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; + struct vmx_msr_entry *start = NULL, *ent, *end; +- unsigned int substart, subend, total; ++ unsigned int substart = 0, subend = vmx->msr_save_count; ++ unsigned int total = vmx->msr_load_count; + + ASSERT(v == current || !vcpu_runnable(v)); + +@@ -1318,23 +1319,18 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, + { + case VMX_MSR_HOST: + start = vmx->host_msr_area; +- substart = 0; + subend = vmx->host_msr_count; + total = subend; + break; + + case VMX_MSR_GUEST: + start = vmx->msr_area; +- substart = 0; +- subend = vmx->msr_save_count; +- total = vmx->msr_load_count; + break; + + case VMX_MSR_GUEST_LOADONLY: + start = vmx->msr_area; +- substart = vmx->msr_save_count; +- subend = vmx->msr_load_count; +- total = subend; ++ substart = subend; ++ subend = total; + break; + + default: +-- +2.17.1 + + +From fa79f9e762be390b56218437ed317a695a03a5e7 Mon Sep 17 00:00:00 2001 +From: Stefano Stabellini +Date: Mon, 13 Aug 2018 17:25:51 +0100 +Subject: [PATCH 30/42] ARM: disable grant table v2 + +It was never expected to work, the implementation is incomplete. + +As a side effect, it also prevents guests from triggering a +"BUG_ON(page_get_owner(pg) != d)" in gnttab_unpopulate_status_frames(). + +This is XSA-268. + +Signed-off-by: Stefano Stabellini +Acked-by: Jan Beulich +(cherry picked from commit 9a5c16a3e75778c8a094ca87784d93b74676f46c) +--- + docs/misc/xen-command-line.markdown | 2 ++ + xen/common/grant_table.c | 6 +++++- + xen/include/asm-arm/grant_table.h | 1 + + 3 files changed, 8 insertions(+), 1 deletion(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 3b710b71fb..e5e7fdc405 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -936,6 +936,8 @@ version are 1 and 2. + use of grant table v2 without transitive grants is an ABI breakage from the + guests point of view. + ++The usage of gnttab v2 is not security supported on ARM platforms. 
++ + ### gnttab\_max\_frames + > `= ` + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index c757b7f6f5..231ecf509a 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -97,7 +97,11 @@ static unsigned int __read_mostly max_maptrack_frames = + DEFAULT_MAX_MAPTRACK_FRAMES; + integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames); + +-static unsigned int __read_mostly opt_gnttab_max_version = 2; ++#ifndef GNTTAB_MAX_VERSION ++#define GNTTAB_MAX_VERSION 2 ++#endif ++ ++static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION; + static bool __read_mostly opt_transitive_grants = true; + + static int __init parse_gnttab(const char *s) +diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h +index e52936c79f..24958e4670 100644 +--- a/xen/include/asm-arm/grant_table.h ++++ b/xen/include/asm-arm/grant_table.h +@@ -7,6 +7,7 @@ + #include + + #define INITIAL_NR_GRANT_FRAMES 1U ++#define GNTTAB_MAX_VERSION 1 + + struct grant_table_arch { + gfn_t *shared_gfn; +-- +2.17.1 + + +From 48fb482ef695c6b193ccfca665e6dd302eb230e2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 13 Aug 2018 17:26:21 +0100 +Subject: [PATCH 31/42] x86/vtx: Fix the checking for unknown/invalid + MSR_DEBUGCTL bits + +The VPMU_MODE_OFF early-exit in vpmu_do_wrmsr() introduced by c/s +11fe998e56 bypasses all reserved bit checking in the general case. As a +result, a guest can enable BTS when it shouldn't be permitted to, and +lock up the entire host. + +With vPMU active (not a security supported configuration, but useful for +debugging), the reserved bit checking in broken, caused by the original +BTS changeset 1a8aa75ed. + +From a correctness standpoint, it is not possible to have two different +pieces of code responsible for different parts of value checking, if +there isn't an accumulation of bits which have been checked. A +practical upshot of this is that a guest can set any value it +wishes (usually resulting in a vmentry failure for bad guest state). + +Therefore, fix this by implementing all the reserved bit checking in the +main MSR_DEBUGCTL block, and removing all handling of DEBUGCTL from the +vPMU MSR logic. + +This is XSA-269. 
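
As an aside (not part of the patch itself): the accumulated reserved-bit check described
above boils down to the following pattern. The bit names below are illustrative stand-ins,
not the actual Xen constants.

    #include <stdbool.h>
    #include <stdint.h>

    #define DBG_LBR (UINT64_C(1) << 0)   /* illustrative bit positions */
    #define DBG_BTF (UINT64_C(1) << 1)
    #define DBG_RTM (UINT64_C(1) << 15)

    /* Start from "everything reserved", clear the bits this vCPU is allowed
     * to use, and reject any write which still touches a reserved bit. */
    static bool debugctl_write_ok(uint64_t val, bool has_rtm)
    {
        uint64_t rsvd = ~(DBG_LBR | DBG_BTF);

        if ( has_rtm )
            rsvd &= ~DBG_RTM;

        return !(val & rsvd);
    }
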
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 2a8a8e99feb950504559196521bc9fd63ed3a962) +--- + xen/arch/x86/cpu/vpmu_intel.c | 20 -------------------- + xen/arch/x86/hvm/vmx/vmx.c | 29 ++++++++++++++++++++--------- + 2 files changed, 20 insertions(+), 29 deletions(-) + +diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c +index 1fc79c9ff4..6e27f6ec8e 100644 +--- a/xen/arch/x86/cpu/vpmu_intel.c ++++ b/xen/arch/x86/cpu/vpmu_intel.c +@@ -533,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + uint64_t *enabled_cntrs; + + if ( !core2_vpmu_msr_common_check(msr, &type, &index) ) +- { +- /* Special handling for BTS */ +- if ( msr == MSR_IA32_DEBUGCTLMSR ) +- { +- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS | +- IA32_DEBUGCTLMSR_BTINT; +- +- if ( cpu_has(¤t_cpu_data, X86_FEATURE_DSCPL) ) +- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS | +- IA32_DEBUGCTLMSR_BTS_OFF_USR; +- if ( !(msr_content & ~supported) && +- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) +- return 0; +- if ( (msr_content & supported) && +- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) +- printk(XENLOG_G_WARNING +- "%pv: Debug Store unsupported on this CPU\n", +- current); +- } + return -EINVAL; +- } + + ASSERT(!supported); + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 1e32f61225..c7cf3a8fbc 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -3038,11 +3038,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v) + static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + { + struct vcpu *v = current; ++ const struct cpuid_policy *cp = v->domain->arch.cpuid; + + HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content); + + switch ( msr ) + { ++ uint64_t rsvd; ++ + case MSR_IA32_SYSENTER_CS: + __vmwrite(GUEST_SYSENTER_CS, msr_content); + break; +@@ -3095,18 +3098,26 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + wrmsrl(MSR_SYSCALL_MASK, msr_content); + break; + +- case MSR_IA32_DEBUGCTLMSR: { +- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF; ++ case MSR_IA32_DEBUGCTLMSR: ++ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF); + +- if ( boot_cpu_has(X86_FEATURE_RTM) ) +- supported |= IA32_DEBUGCTLMSR_RTM; +- if ( msr_content & ~supported ) ++ /* TODO: Wire vPMU settings properly through the CPUID policy */ ++ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) ) + { +- /* Perhaps some other bits are supported in vpmu. */ +- if ( vpmu_do_wrmsr(msr, msr_content, supported) ) +- break; ++ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS | ++ IA32_DEBUGCTLMSR_BTINT); ++ ++ if ( cpu_has(¤t_cpu_data, X86_FEATURE_DSCPL) ) ++ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS | ++ IA32_DEBUGCTLMSR_BTS_OFF_USR); + } + ++ if ( cp->feat.rtm ) ++ rsvd &= ~IA32_DEBUGCTLMSR_RTM; ++ ++ if ( msr_content & rsvd ) ++ goto gp_fault; ++ + /* + * When a guest first enables LBR, arrange to save and restore the LBR + * MSRs and allow the guest direct access. +@@ -3165,7 +3176,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + + __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); + break; +- } ++ + case MSR_IA32_FEATURE_CONTROL: + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + /* None of these MSRs are writeable. 
*/ +-- +2.17.1 + + +From e6441a804b76797c6ebac81b7d70ff19e5df9188 Mon Sep 17 00:00:00 2001 +From: Christian Lindig +Date: Mon, 13 Aug 2018 17:26:56 +0100 +Subject: [PATCH 32/42] tools/oxenstored: Make evaluation order explicit + +In Store.path_write(), Path.apply_modify() updates the node_created +reference and both the value of apply_modify() and node_created are +returned by path_write(). + +At least with OCaml 4.06.1 this leads to the value of node_created being +returned *before* it is updated by apply_modify(). This in turn leads +to the quota for a domain not being updated in Store.write(). Hence, a +guest can create an unlimited number of entries in xenstore. + +The fix is to make evaluation order explicit. + +This is XSA-272. + +Signed-off-by: Christian Lindig +Reviewed-by: Rob Hoes +(cherry picked from commit 73392c7fd14c59f8c96e0b2eeeb329e4ae9086b6) +--- + tools/ocaml/xenstored/store.ml | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml +index 13cf3b5bf4..5a8c377603 100644 +--- a/tools/ocaml/xenstored/store.ml ++++ b/tools/ocaml/xenstored/store.ml +@@ -262,7 +262,8 @@ let path_write store perm path value = + Node.check_perm store.root perm Perms.WRITE; + Node.set_value store.root value, false + ) else +- Path.apply_modify store.root path do_write, !node_created ++ let root = Path.apply_modify store.root path do_write in ++ root, !node_created + + let path_rm store perm path = + let do_rm node name = +-- +2.17.1 + + +From d044f6cc590c58178d87ad78f1859d1c7905ee0b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 25 Jul 2018 12:10:19 +0000 +Subject: [PATCH 33/42] x86/spec-ctrl: Calculate safe PTE addresses for L1TF + mitigations + +Safe PTE addresses for L1TF mitigations are ones which are within the L1D +address width (may be wider than reported in CPUID), and above the highest +cacheable RAM/NVDIMM/BAR/etc. + +All logic here is best-effort heuristics, which should in practice be fine for +most hardware. Future work will see about disentangling the SRAT handling +further, as well as having L0 pass this information down to lower levels when +virtualised. + +This is part of XSA-273 / CVE-2018-3620. + +Signed-off-by: Andrew Cooper +Signed-off-by: Jan Beulich +(cherry picked from commit b03a57c9383b32181e60add6b6de12b473652aa4) +--- + xen/arch/x86/setup.c | 12 +++ + xen/arch/x86/spec_ctrl.c | 153 ++++++++++++++++++++++++++++++++ + xen/arch/x86/srat.c | 8 +- + xen/common/efi/boot.c | 12 +++ + xen/include/asm-x86/spec_ctrl.h | 7 ++ + 5 files changed, 190 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 66fd13f93a..3cd3e81b30 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -912,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p) + /* Sanitise the raw E820 map to produce a final clean version. */ + max_page = raw_max_page = init_e820(memmap_type, &e820_raw); + ++ if ( !efi_enabled(EFI_BOOT) ) ++ { ++ /* ++ * Supplement the heuristics in l1tf_calculations() by assuming that ++ * anything referenced in the E820 may be cacheable. ++ */ ++ l1tf_safe_maddr = ++ max(l1tf_safe_maddr, ++ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr + ++ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE)); ++ } ++ + /* Create a temporary copy of the E820 map. 
*/ + memcpy(&boot_e820, &e820, sizeof(e820)); + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 32213ace86..fe15a58de0 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -50,6 +50,10 @@ bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; + uint8_t __read_mostly default_spec_ctrl_flags; + ++paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr; ++static bool __initdata cpu_has_bug_l1tf; ++static unsigned int __initdata l1d_maxphysaddr; ++ + static int __init parse_bti(const char *s) + { + const char *ss; +@@ -420,6 +424,153 @@ static bool __init should_use_eager_fpu(void) + } + } + ++/* Calculate whether this CPU is vulnerable to L1TF. */ ++static __init void l1tf_calculations(uint64_t caps) ++{ ++ bool hit_default = false; ++ ++ l1d_maxphysaddr = paddr_bits; ++ ++ /* L1TF is only known to affect Intel Family 6 processors at this time. */ ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ boot_cpu_data.x86 == 6 ) ++ { ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* ++ * Core processors since at least Penryn are vulnerable. ++ */ ++ case 0x17: /* Penryn */ ++ case 0x1d: /* Dunnington */ ++ cpu_has_bug_l1tf = true; ++ break; ++ ++ case 0x1f: /* Auburndale / Havendale */ ++ case 0x1e: /* Nehalem */ ++ case 0x1a: /* Nehalem EP */ ++ case 0x2e: /* Nehalem EX */ ++ case 0x25: /* Westmere */ ++ case 0x2c: /* Westmere EP */ ++ case 0x2f: /* Westmere EX */ ++ cpu_has_bug_l1tf = true; ++ l1d_maxphysaddr = 44; ++ break; ++ ++ case 0x2a: /* SandyBridge */ ++ case 0x2d: /* SandyBridge EP/EX */ ++ case 0x3a: /* IvyBridge */ ++ case 0x3e: /* IvyBridge EP/EX */ ++ case 0x3c: /* Haswell */ ++ case 0x3f: /* Haswell EX/EP */ ++ case 0x45: /* Haswell D */ ++ case 0x46: /* Haswell H */ ++ case 0x3d: /* Broadwell */ ++ case 0x47: /* Broadwell H */ ++ case 0x4f: /* Broadwell EP/EX */ ++ case 0x56: /* Broadwell D */ ++ case 0x4e: /* Skylake M */ ++ case 0x55: /* Skylake X */ ++ case 0x5e: /* Skylake D */ ++ case 0x66: /* Cannonlake */ ++ case 0x67: /* Cannonlake? */ ++ case 0x8e: /* Kabylake M */ ++ case 0x9e: /* Kabylake D */ ++ cpu_has_bug_l1tf = true; ++ l1d_maxphysaddr = 46; ++ break; ++ ++ /* ++ * Atom processors are not vulnerable. ++ */ ++ case 0x1c: /* Pineview */ ++ case 0x26: /* Lincroft */ ++ case 0x27: /* Penwell */ ++ case 0x35: /* Cloverview */ ++ case 0x36: /* Cedarview */ ++ case 0x37: /* Baytrail / Valleyview (Silvermont) */ ++ case 0x4d: /* Avaton / Rangely (Silvermont) */ ++ case 0x4c: /* Cherrytrail / Brasswell */ ++ case 0x4a: /* Merrifield */ ++ case 0x5a: /* Moorefield */ ++ case 0x5c: /* Goldmont */ ++ case 0x5f: /* Denverton */ ++ case 0x7a: /* Gemini Lake */ ++ break; ++ ++ /* ++ * Knights processors are not vulnerable. ++ */ ++ case 0x57: /* Knights Landing */ ++ case 0x85: /* Knights Mill */ ++ break; ++ ++ default: ++ /* Defer printk() until we've accounted for RDCL_NO. */ ++ hit_default = true; ++ cpu_has_bug_l1tf = true; ++ break; ++ } ++ } ++ ++ /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */ ++ if ( caps & ARCH_CAPABILITIES_RDCL_NO ) ++ cpu_has_bug_l1tf = false; ++ ++ if ( cpu_has_bug_l1tf && hit_default ) ++ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n", ++ boot_cpu_data.x86_model); ++ ++ /* ++ * L1TF safe address heuristics. These apply to the real hardware we are ++ * running on, and are best-effort-only if Xen is virtualised. 
++ * ++ * The address mask which the L1D cache uses, which might be wider than ++ * the CPUID-reported maxphysaddr. ++ */ ++ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK; ++ ++ /* ++ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity ++ * in system physical address space. However, to preserve space for ++ * paged-out metadata, it should be as low as possible above the highest ++ * cacheable address, so as to require fewer high-order bits being set. ++ * ++ * These heuristics are based on some guesswork to improve the likelihood ++ * of safety in the common case, including Linux's L1TF mitigation of ++ * inverting all address bits in a non-present PTE. ++ * ++ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end ++ * server), setting any address bit beyond CPUID maxphysaddr guarantees ++ * to make the PTE safe. This case doesn't require all the high-order ++ * bits being set, and doesn't require any other source of information ++ * for safety. ++ * ++ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we ++ * must sacrifice high order bits from the real address space for ++ * safety. Therefore, make a blind guess that there is nothing ++ * cacheable in the top quarter of physical address space. ++ * ++ * It is exceedingly unlikely for machines to be populated with this ++ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on ++ * Sandybridge and later) due to the sheer volume of DIMMs this would ++ * actually take. ++ * ++ * However, it is possible to find machines this large, so the "top ++ * quarter" guess is supplemented to push the limit higher if references ++ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top ++ * quarter boundary. ++ * ++ * Finally, this top quarter guess gives us a good chance of being safe ++ * when running virtualised (and the CPUID maxphysaddr hasn't been ++ * levelled for heterogeneous migration safety), where the safety ++ * consideration is still in terms of host details, but all E820/etc ++ * information is in terms of guest physical layout. ++ */ ++ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits) ++ ? (1ul << paddr_bits) ++ : (3ul << (paddr_bits - 2)))); ++} ++ + int8_t __read_mostly opt_xpti = -1; + + static __init void xpti_init_default(uint64_t caps) +@@ -633,6 +784,8 @@ void __init init_speculation_mitigations(void) + else + setup_clear_cpu_cap(X86_FEATURE_NO_XPTI); + ++ l1tf_calculations(caps); ++ + print_details(thunk, caps); + + /* +diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c +index 166eb44fe2..2d70b45909 100644 +--- a/xen/arch/x86/srat.c ++++ b/xen/arch/x86/srat.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + static struct acpi_table_slit *__read_mostly acpi_slit; + +@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return; + ++ start = ma->base_address; ++ end = start + ma->length; ++ /* Supplement the heuristics in l1tf_calculations(). 
*/ ++ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE)); ++ + if (num_node_memblks >= NR_NODE_MEMBLKS) + { + dprintk(XENLOG_WARNING, +@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) + return; + } + +- start = ma->base_address; +- end = start + ma->length; + pxm = ma->proximity_domain; + if (srat_rev < 2) + pxm &= 0xff; +diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c +index 64d12685d3..6be0b3986f 100644 +--- a/xen/common/efi/boot.c ++++ b/xen/common/efi/boot.c +@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) + + #ifndef CONFIG_ARM /* TODO - runtime service support */ + ++#include ++ + static bool __initdata efi_map_uc; + + static int __init parse_efi_param(const char *s) +@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void) + desc->PhysicalStart, desc->PhysicalStart + len - 1, + desc->Type, desc->Attribute); + ++ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) || ++ (efi_bs_revision >= EFI_REVISION(2, 5) && ++ (desc->Attribute & EFI_MEMORY_WP)) ) ++ { ++ /* Supplement the heuristics in l1tf_calculations(). */ ++ l1tf_safe_maddr = ++ max(l1tf_safe_maddr, ++ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE)); ++ } ++ + if ( !efi_enabled(EFI_RS) || + (!(desc->Attribute & EFI_MEMORY_RUNTIME) && + (!map_bs || +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index fea82603ca..d7e8ed0f5f 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -38,6 +38,13 @@ extern int8_t opt_xpti; + #define OPT_XPTI_DOM0 0x01 + #define OPT_XPTI_DOMU 0x02 + ++/* ++ * The L1D address mask, which might be wider than reported in CPUID, and the ++ * system physical address above which there are believed to be no cacheable ++ * memory regions, thus unable to leak data via the L1TF vulnerability. ++ */ ++extern paddr_t l1tf_addr_mask, l1tf_safe_maddr; ++ + static inline void init_shadow_spec_ctrl_state(void) + { + struct cpu_info *info = get_cpu_info(); +-- +2.17.1 + + +From 57483c09ef4fe9489ec4214989a97949916fecc0 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 23 Jul 2018 13:46:10 +0000 +Subject: [PATCH 34/42] x86/spec-ctrl: Introduce an option to control L1TF + mitigation for PV guests + +Shadowing a PV guest is only available when shadow paging is compiled in. +When shadow paging isn't available, guests can be crashed instead as +mitigation from Xen's point of view. + +Ideally, dom0 would also be potentially-shadowed-by-default, but dom0 has +never been shadowed before, and there are some stability issues under +investigation. + +This is part of XSA-273 / CVE-2018-3620. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 66a4e986819a86ba66ca2fe9d925e62a4fd30114) +--- + docs/misc/xen-command-line.markdown | 24 ++++++++ + xen/arch/x86/Kconfig | 1 + + xen/arch/x86/spec_ctrl.c | 89 ++++++++++++++++++++++++++++- + xen/include/asm-x86/spec_ctrl.h | 4 ++ + 4 files changed, 115 insertions(+), 3 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index e5e7fdc405..763cc1d878 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1546,6 +1546,30 @@ do; there may be other custom operating systems which do. If you're + certain you don't plan on having PV guests which use this feature, + turning it off can reduce the attack surface. 
+ ++### pv-l1tf (x86) ++> `= List of [ , dom0=, domu= ]` ++ ++> Default: `false` on believed-unaffected hardware, or in pv-shim mode. ++> `domu` on believed-affected hardware. ++ ++Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests. ++ ++For backwards compatibility, we may not alter an architecturally-legitimate ++pagetable entry a PV guest chooses to write. We can however force such a ++guest into shadow mode so that Xen controls the PTEs which are reachable by ++the CPU pagewalk. ++ ++Shadowing is performed at the point where a PV guest first tries to write an ++L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with ++its own L1TF mitigations will not trigger shadow mode if it is well behaved. ++ ++If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes ++the guest when an L1TF-vulnerable PTE is written, which still allows updated, ++well-behaved PV guests to run, despite Shadow being compiled out. ++ ++In the pv-shim case, Shadow is expected to be compiled out, and a malicious ++guest kernel can only leak data from the shim Xen, rather than the host Xen. ++ + ### pv-shim (x86) + > `= ` + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index f64fc56739..cfba4a708c 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -72,6 +72,7 @@ config SHADOW_PAGING + * Running HVM guests on hardware lacking hardware paging support + (First-generation Intel VT-x or AMD SVM). + * Live migration of PV guests. ++ * L1TF sidechannel mitigation for PV guests. + + Under a small number of specific workloads, shadow paging may be + deliberately used as a performance optimisation. +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index fe15a58de0..7995e27218 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -203,6 +204,55 @@ static int __init parse_spec_ctrl(const char *s) + } + custom_param("spec-ctrl", parse_spec_ctrl); + ++int8_t __read_mostly opt_pv_l1tf = -1; ++ ++static __init int parse_pv_l1tf(const char *s) ++{ ++ const char *ss; ++ int val, rc = 0; ++ ++ /* Inhibit the defaults as an explicit choice has been given. */ ++ if ( opt_pv_l1tf == -1 ) ++ opt_pv_l1tf = 0; ++ ++ /* Interpret 'pv-l1tf' alone in its positive boolean form. */ ++ if ( *s == '\0' ) ++ opt_xpti = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU; ++ ++ do { ++ ss = strchr(s, ','); ++ if ( !ss ) ++ ss = strchr(s, '\0'); ++ ++ switch ( parse_bool(s, ss) ) ++ { ++ case 0: ++ opt_pv_l1tf = 0; ++ break; ++ ++ case 1: ++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU; ++ break; ++ ++ default: ++ if ( (val = parse_boolean("dom0", s, ss)) >= 0 ) ++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) | ++ (val ? OPT_PV_L1TF_DOM0 : 0)); ++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 ) ++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) | ++ (val ? OPT_PV_L1TF_DOMU : 0)); ++ else ++ rc = -EINVAL; ++ break; ++ } ++ ++ s = ss + 1; ++ } while ( *ss ); ++ ++ return rc; ++} ++custom_param("pv-l1tf", parse_pv_l1tf); ++ + static void __init print_details(enum ind_thunk thunk, uint64_t caps) + { + unsigned int _7d0 = 0, e8b = 0, tmp; +@@ -226,9 +276,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", + (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : ""); + +- /* Compiled-in support which pertains to BTI mitigations. 
*/ +- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) ) +- printk(" Compiled-in support: INDIRECT_THUNK\n"); ++ /* Compiled-in support which pertains to mitigations. */ ++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) ++ printk(" Compiled-in support:" ++#ifdef CONFIG_INDIRECT_THUNK ++ " INDIRECT_THUNK" ++#endif ++#ifdef CONFIG_SHADOW_PAGING ++ " SHADOW_PAGING" ++#endif ++ "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ + printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n", +@@ -242,6 +299,13 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", + opt_ibpb ? " IBPB" : ""); + ++ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ ++ if ( cpu_has_bug_l1tf || opt_pv_l1tf ) ++ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u" ++ ", Safe address %"PRIx64"\n", ++ cpu_has_bug_l1tf ? "" : " not", ++ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr); ++ + /* + * Alternatives blocks for protecting against and/or virtualising + * mitigation support for guests. +@@ -263,6 +327,10 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n", + opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled", + opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled"); ++ ++ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n", ++ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled", ++ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled"); + } + + /* Calculate whether Retpoline is known-safe on this CPU. */ +@@ -786,6 +854,21 @@ void __init init_speculation_mitigations(void) + + l1tf_calculations(caps); + ++ /* ++ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable ++ * hardware, except when running in shim mode. ++ * ++ * In shim mode, SHADOW is expected to be compiled out, and a malicious ++ * guest kernel can only attack the shim Xen, not the host Xen. ++ */ ++ if ( opt_pv_l1tf == -1 ) ++ { ++ if ( pv_shim || !cpu_has_bug_l1tf ) ++ opt_pv_l1tf = 0; ++ else ++ opt_pv_l1tf = OPT_PV_L1TF_DOMU; ++ } ++ + print_details(thunk, caps); + + /* +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index d7e8ed0f5f..cdf5737dc2 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -38,6 +38,10 @@ extern int8_t opt_xpti; + #define OPT_XPTI_DOM0 0x01 + #define OPT_XPTI_DOMU 0x02 + ++extern int8_t opt_pv_l1tf; ++#define OPT_PV_L1TF_DOM0 0x01 ++#define OPT_PV_L1TF_DOMU 0x02 ++ + /* + * The L1D address mask, which might be wider than reported in CPUID, and the + * system physical address above which there are believed to be no cacheable +-- +2.17.1 + + +From 02d2c660935cfd6ff2438afb3892776dfc7db711 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Mon, 23 Jul 2018 07:11:40 +0100 +Subject: [PATCH 35/42] x86/shadow: Infrastructure to force a PV guest into + shadow mode + +To mitigate L1TF, we cannot alter an architecturally-legitimate PTE a PV guest +chooses to write, but we can force the PV domain into shadow mode so Xen +controls the PTEs which are reachable by the CPU pagewalk. + +Introduce new shadow mode, PG_SH_forced, and a tasklet to perform the +transition. Later patches will introduce the logic to enable this mode at the +appropriate time. + +To simplify vcpu cleanup, make tasklet_kill() idempotent with respect to +tasklet_init(), which involves adding a helper to check for an uninitialised +list head. 
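
As an aside (not part of the patch itself): the helper mentioned above distinguishes a
zero-filled, never-initialised list head from an initialised-but-empty one, roughly as
sketched below; the real helper is the one added to xen/include/xen/list.h in the diff
that follows.

    #include <stdbool.h>

    struct list_head { struct list_head *next, *prev; };

    /* A zero-filled, never-initialised list head has both pointers NULL,
     * whereas an initialised-but-empty one points back at itself, so
     * tasklet_kill() can bail out early for tasklets that were never
     * tasklet_init()'d. */
    static bool list_head_is_null(const struct list_head *list)
    {
        return !list->next && !list->prev;
    }
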
+ +This is part of XSA-273 / CVE-2018-3620. + +Signed-off-by: Juergen Gross +Signed-off-by: Andrew Cooper +Reviewed-by: Tim Deegan +Reviewed-by: Jan Beulich +(cherry picked from commit b76ec3946bf6caca2c3950b857c008bc8db6723f) +--- + xen/arch/x86/mm/paging.c | 2 ++ + xen/arch/x86/mm/shadow/common.c | 36 +++++++++++++++++++++++++++++++++ + xen/arch/x86/pv/domain.c | 5 +++++ + xen/common/tasklet.c | 5 +++++ + xen/include/asm-x86/domain.h | 7 +++++++ + xen/include/asm-x86/paging.h | 4 ++++ + xen/include/asm-x86/shadow.h | 32 +++++++++++++++++++++++++++++ + xen/include/xen/list.h | 5 +++++ + 8 files changed, 96 insertions(+) + +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index 2b0445ffe9..dcee496eb0 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -873,6 +873,8 @@ void paging_dump_domain_info(struct domain *d) + printk(" paging assistance: "); + if ( paging_mode_shadow(d) ) + printk("shadow "); ++ if ( paging_mode_sh_forced(d) ) ++ printk("forced "); + if ( paging_mode_hap(d) ) + printk("hap "); + if ( paging_mode_refcounts(d) ) +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index dd61b50eb7..fd42d734e7 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -3177,6 +3177,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode) + ASSERT(paging_locked_by_me(d)); + ASSERT(d != current->domain); + ++ /* ++ * If PG_SH_forced has previously been activated because of writing an ++ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime ++ * of the domain, even if the logdirty mode needs to be controlled for ++ * migration purposes. ++ */ ++ if ( paging_mode_sh_forced(d) ) ++ new_mode |= PG_SH_forced | PG_SH_enable; ++ + d->arch.paging.mode = new_mode; + for_each_vcpu(d, v) + sh_update_paging_modes(v); +@@ -4057,6 +4066,33 @@ void shadow_audit_tables(struct vcpu *v) + + #endif /* Shadow audit */ + ++#ifdef CONFIG_PV ++ ++void pv_l1tf_tasklet(unsigned long data) ++{ ++ struct domain *d = (void *)data; ++ ++ domain_pause(d); ++ paging_lock(d); ++ ++ if ( !paging_mode_sh_forced(d) && !d->is_dying ) ++ { ++ int ret = shadow_one_bit_enable(d, PG_SH_forced); ++ ++ if ( ret ) ++ { ++ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n", ++ d->domain_id, ret); ++ domain_crash(d); ++ } ++ } ++ ++ paging_unlock(d); ++ domain_unpause(d); ++} ++ ++#endif /* CONFIG_PV */ ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index a4f0bd239d..3230ac6a22 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + static __read_mostly enum { + PCID_OFF, +@@ -209,6 +210,8 @@ int pv_vcpu_initialise(struct vcpu *v) + + void pv_domain_destroy(struct domain *d) + { ++ pv_l1tf_domain_destroy(d); ++ + destroy_perdomain_mapping(d, GDT_LDT_VIRT_START, + GDT_LDT_MBYTES << (20 - PAGE_SHIFT)); + +@@ -229,6 +232,8 @@ int pv_domain_initialise(struct domain *d) + }; + int rc = -ENOMEM; + ++ pv_l1tf_domain_init(d); ++ + d->arch.pv_domain.gdt_ldt_l1tab = + alloc_xenheap_pages(0, MEMF_node(domain_to_node(d))); + if ( !d->arch.pv_domain.gdt_ldt_l1tab ) +diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c +index 0f0a6f8365..d4fea3151c 100644 +--- a/xen/common/tasklet.c ++++ b/xen/common/tasklet.c +@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t) + + spin_lock_irqsave(&tasklet_lock, flags); + ++ /* Cope with uninitialised tasklets. 
*/ ++ if ( list_head_is_null(&t->list) ) ++ goto unlock; ++ + if ( !list_empty(&t->list) ) + { + BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0)); +@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t) + spin_lock_irqsave(&tasklet_lock, flags); + } + ++ unlock: + spin_unlock_irqrestore(&tasklet_lock, flags); + } + +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index e0d413c7de..61e6900465 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -121,6 +121,11 @@ struct shadow_domain { + + /* Has this domain ever used HVMOP_pagetable_dying? */ + bool_t pagetable_dying_op; ++ ++#ifdef CONFIG_PV ++ /* PV L1 Terminal Fault mitigation. */ ++ struct tasklet pv_l1tf_tasklet; ++#endif /* CONFIG_PV */ + #endif + }; + +@@ -257,6 +262,8 @@ struct pv_domain + bool xpti; + /* Use PCID feature? */ + bool pcid; ++ /* Mitigate L1TF with shadow/crashing? */ ++ bool check_l1tf; + + /* map_domain_page() mapping cache. */ + struct mapcache_domain mapcache; +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index f0085511c7..f440e3e53c 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -37,11 +37,14 @@ + + #define PG_SH_shift 20 + #define PG_HAP_shift 21 ++#define PG_SHF_shift 22 + /* We're in one of the shadow modes */ + #ifdef CONFIG_SHADOW_PAGING + #define PG_SH_enable (1U << PG_SH_shift) ++#define PG_SH_forced (1U << PG_SHF_shift) + #else + #define PG_SH_enable 0 ++#define PG_SH_forced 0 + #endif + #define PG_HAP_enable (1U << PG_HAP_shift) + +@@ -62,6 +65,7 @@ + + #define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode) + #define paging_mode_shadow(_d) (!!((_d)->arch.paging.mode & PG_SH_enable)) ++#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced)) + #define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable)) + + #define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts)) +diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h +index 94a34fd16a..14afb7db52 100644 +--- a/xen/include/asm-x86/shadow.h ++++ b/xen/include/asm-x86/shadow.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + /***************************************************************************** + * Macros to tell which shadow paging mode a domain is in*/ +@@ -115,6 +116,37 @@ static inline int shadow_domctl(struct domain *d, + + #endif /* CONFIG_SHADOW_PAGING */ + ++/* ++ * Mitigations for L1TF / CVE-2018-3620 for PV guests. ++ * ++ * We cannot alter an architecturally-legitimate PTE which a PV guest has ++ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable. ++ * What we can do is force a PV guest which writes a vulnerable PTE into ++ * shadow mode, so Xen controls the pagetables which are reachable by the CPU ++ * pagewalk. ++ */ ++ ++void pv_l1tf_tasklet(unsigned long data); ++ ++static inline void pv_l1tf_domain_init(struct domain *d) ++{ ++ d->arch.pv_domain.check_l1tf = ++ opt_pv_l1tf & (is_hardware_domain(d) ++ ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU); ++ ++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV) ++ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet, ++ pv_l1tf_tasklet, (unsigned long)d); ++#endif ++} ++ ++static inline void pv_l1tf_domain_destroy(struct domain *d) ++{ ++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV) ++ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet); ++#endif ++} ++ + /* Remove all shadows of the guest mfn. 
*/ + static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn) + { +diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h +index fa07d720ee..1387abb211 100644 +--- a/xen/include/xen/list.h ++++ b/xen/include/xen/list.h +@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list) + list->prev = list; + } + ++static inline bool list_head_is_null(const struct list_head *list) ++{ ++ return !list->next && !list->prev; ++} ++ + /* + * Insert a new entry between two known consecutive entries. + * +-- +2.17.1 + + +From f4a049ede7ee9e1fafad6248cffc5e6deac1bc39 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 23 Jul 2018 08:11:40 +0200 +Subject: [PATCH 36/42] x86/mm: Plumbing to allow any PTE update to fail with + -ERESTART + +Switching to shadow mode is performed in tasklet context. To facilitate this, +we schedule the tasklet, then create a hypercall continuation to allow the +switch to take place. + +As a consequence, the x86 mm code needs to cope with an L1e operation being +continuable. do_mmu{,ext}_op() may no longer assert that a continuation +doesn't happen on the final iteration. + +To handle the arguments correctly on continuation, compat_update_va_mapping*() +may no longer call into their non-compat counterparts. Move the compat +functions into mm.c rather than exporting __do_update_va_mapping() and +{get,put}_pg_owner(), and fix an unsigned long/int inconsistency with +compat_update_va_mapping_otherdomain(). + +This is part of XSA-273 / CVE-2018-3620. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit c612481d1c9232c6abf91b03ec655e92f808805f) +--- + xen/arch/x86/mm.c | 83 ++++++++++++++++++++++++++------- + xen/arch/x86/x86_64/compat/mm.c | 13 ------ + xen/include/asm-x86/hypercall.h | 2 +- + 3 files changed, 66 insertions(+), 32 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index bcf46c0743..657af50c4c 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -613,6 +613,9 @@ static int alloc_segdesc_page(struct page_info *page) + return i == 512 ? 0 : -EINVAL; + } + ++static int _get_page_type(struct page_info *page, unsigned long type, ++ bool preemptible); ++ + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, + int partial, int preemptible) +@@ -624,9 +627,7 @@ static int get_page_and_type_from_mfn( + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + +- rc = (preemptible ? +- get_page_type_preemptible(page, type) : +- (get_page_type(page, type) ? 
0 : -EINVAL)); ++ rc = _get_page_type(page, type, preemptible); + + if ( unlikely(rc) && partial >= 0 && + (!preemptible || page != current->arch.old_guest_table) ) +@@ -1456,8 +1457,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e) + return 1; + } + +-static int alloc_l2_table(struct page_info *page, unsigned long type, +- int preemptible) ++static int alloc_l2_table(struct page_info *page, unsigned long type) + { + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); +@@ -1469,8 +1469,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type, + + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) + { +- if ( preemptible && i > page->nr_validated_ptes +- && hypercall_preempt_check() ) ++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + rc = -ERESTART; +@@ -1481,6 +1480,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type, + (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) + continue; + ++ if ( unlikely(rc == -ERESTART) ) ++ { ++ page->nr_validated_ptes = i; ++ break; ++ } ++ + if ( rc < 0 ) + { + gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); +@@ -1763,7 +1768,7 @@ static void free_l1_table(struct page_info *page) + } + + +-static int free_l2_table(struct page_info *page, int preemptible) ++static int free_l2_table(struct page_info *page) + { + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); +@@ -1777,7 +1782,7 @@ static int free_l2_table(struct page_info *page, int preemptible) + do { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && + put_page_from_l2e(pl2e[i], pfn) == 0 && +- preemptible && i && hypercall_preempt_check() ) ++ i && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + err = -ERESTART; +@@ -2373,7 +2378,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type, + rc = alloc_l1_table(page); + break; + case PGT_l2_page_table: +- rc = alloc_l2_table(page, type, preemptible); ++ ASSERT(preemptible); ++ rc = alloc_l2_table(page, type); + break; + case PGT_l3_page_table: + ASSERT(preemptible); +@@ -2463,7 +2469,8 @@ int free_page_type(struct page_info *page, unsigned long type, + rc = 0; + break; + case PGT_l2_page_table: +- rc = free_l2_table(page, preemptible); ++ ASSERT(preemptible); ++ rc = free_l2_table(page); + break; + case PGT_l3_page_table: + ASSERT(preemptible); +@@ -3550,12 +3557,9 @@ long do_mmuext_op( + } + + if ( rc == -ERESTART ) +- { +- ASSERT(i < count); + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); +- } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE_PARAM(void) null; +@@ -3861,12 +3865,9 @@ long do_mmu_update( + } + + if ( rc == -ERESTART ) +- { +- ASSERT(i < count); + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); +- } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE_PARAM(void) null; +@@ -4121,7 +4122,13 @@ static int __do_update_va_mapping( + long do_update_va_mapping(unsigned long va, u64 val64, + unsigned long flags) + { +- return __do_update_va_mapping(va, val64, flags, current->domain); ++ int rc = __do_update_va_mapping(va, val64, flags, current->domain); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping, "lll", va, val64, 
flags); ++ ++ return rc; + } + + long do_update_va_mapping_otherdomain(unsigned long va, u64 val64, +@@ -4138,6 +4145,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64, + + put_pg_owner(pg_owner); + ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping_otherdomain, ++ "llli", va, val64, flags, domid); ++ ++ return rc; ++} ++ ++int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi, ++ unsigned int flags) ++{ ++ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, ++ flags, current->domain); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags); ++ ++ return rc; ++} ++ ++int compat_update_va_mapping_otherdomain(unsigned int va, ++ uint32_t lo, uint32_t hi, ++ unsigned int flags, domid_t domid) ++{ ++ struct domain *pg_owner; ++ int rc; ++ ++ if ( (pg_owner = get_pg_owner(domid)) == NULL ) ++ return -ESRCH; ++ ++ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner); ++ ++ put_pg_owner(pg_owner); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping_otherdomain, ++ "iiiii", va, lo, hi, flags, domid); ++ + return rc; + } + +diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c +index c2aa6f2fdb..02bc75b91e 100644 +--- a/xen/arch/x86/x86_64/compat/mm.c ++++ b/xen/arch/x86/x86_64/compat/mm.c +@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + return rc; + } + +-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi, +- unsigned int flags) +-{ +- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags); +-} +- +-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi, +- unsigned long flags, +- domid_t domid) +-{ +- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid); +-} +- + DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t); + + int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg, +diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h +index 1cc2e37d5c..da38b7991c 100644 +--- a/xen/include/asm-x86/hypercall.h ++++ b/xen/include/asm-x86/hypercall.h +@@ -165,7 +165,7 @@ extern int compat_update_va_mapping( + unsigned int va, u32 lo, u32 hi, unsigned int flags); + + extern int compat_update_va_mapping_otherdomain( +- unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid); ++ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid); + + DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t); + extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps); +-- +2.17.1 + + +From 665e7685b4f5a683101ef833c45415e2548d873f Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Mon, 23 Jul 2018 08:11:40 +0200 +Subject: [PATCH 37/42] x86/pv: Force a guest into shadow mode when it writes + an L1TF-vulnerable PTE + +See the comment in shadow.h for an explanation of L1TF and the safety +consideration of the PTEs. + +In the case that CONFIG_SHADOW_PAGING isn't compiled in, crash the domain +instead. This allows well-behaved PV guests to function, while preventing +L1TF from being exploited. (Note: PV guest kernels which haven't been updated +with L1TF mitigations will likely be crashed as soon as they try paging a +piece of userspace out to disk.) + +This is part of XSA-273 / CVE-2018-3620. 
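
As an aside (not part of the patch itself): conceptually, the check applied to not-present
PTEs reduces to the sketch below, simplified from the predicates added to shadow.h further
down; the macro and parameter names are illustrative.

    #include <stdbool.h>
    #include <stdint.h>

    #define PTE_PRESENT (UINT64_C(1) << 0)
    #define PTE_PSE     (UINT64_C(1) << 7)

    /* A not-present PTE is treated as dangerous if it claims a superpage, or
     * if its address bits point at something below the highest cacheable
     * address (the all-zero address, mfn 0, is considered safe). */
    static bool pte_is_l1tf_vulnerable(unsigned int level, uint64_t pte,
                                       uint64_t addr_mask, uint64_t safe_maddr)
    {
        uint64_t maddr = pte & addr_mask;

        if ( pte & PTE_PRESENT )
            return false;   /* present PTEs are covered by the type-count logic */

        if ( level > 1 && (pte & PTE_PSE) )
            return true;

        return maddr != 0 && maddr < safe_maddr;
    }
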
+ +Signed-off-by: Juergen Gross +Signed-off-by: Andrew Cooper +Reviewed-by: Tim Deegan +Reviewed-by: Jan Beulich +(cherry picked from commit 06e8b622d3f3c0fa5075e91b041c6f45549ad70a) +--- + xen/arch/x86/mm.c | 22 ++++++-- + xen/arch/x86/pv/ro-page-fault.c | 5 ++ + xen/include/asm-x86/shadow.h | 94 +++++++++++++++++++++++++++++++++ + xen/include/xen/tasklet.h | 5 ++ + 4 files changed, 123 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 657af50c4c..7d4871b791 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1116,7 +1116,7 @@ get_page_from_l2e( + int rc; + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1; + + if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) + { +@@ -1147,7 +1147,7 @@ get_page_from_l3e( + int rc; + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1; + + if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) + { +@@ -1180,7 +1180,7 @@ get_page_from_l4e( + int rc; + + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1; + + if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) + { +@@ -1390,6 +1390,13 @@ static int alloc_l1_table(struct page_info *page) + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { ++ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) ++ { ++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; ++ if ( ret ) ++ goto out; ++ } ++ + switch ( ret = get_page_from_l1e(pl1e[i], d, d) ) + { + default: +@@ -1410,6 +1417,7 @@ static int alloc_l1_table(struct page_info *page) + + fail: + gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i); ++ out: + while ( i-- > 0 ) + put_page_from_l1e(pl1e[i], d); + +@@ -2060,6 +2068,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, + rc = -EBUSY; + } + } ++ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, + preserve_ad)) ) + { +@@ -2123,6 +2133,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, + rc = -EBUSY; + } + } ++ else if ( pv_l1tf_check_l2e(d, nl2e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, + preserve_ad)) ) + { +@@ -2184,6 +2196,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, + rc = -EFAULT; + } + } ++ else if ( pv_l1tf_check_l3e(d, nl3e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, + preserve_ad)) ) + { +@@ -2249,6 +2263,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + rc = -EFAULT; + } + } ++ else if ( pv_l1tf_check_l4e(d, nl4e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, + preserve_ad)) ) + { +diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c +index aa8d5a7556..a3c0c2dd19 100644 +--- a/xen/arch/x86/pv/ro-page-fault.c ++++ b/xen/arch/x86/pv/ro-page-fault.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include "emulate.h" + #include "mm.h" +@@ -129,6 +130,10 @@ static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old, + + /* Check the new PTE. 
*/ + nl1e = l1e_from_intpte(val); ++ ++ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) ) ++ return X86EMUL_RETRY; ++ + switch ( ret = get_page_from_l1e(nl1e, d, d) ) + { + default: +diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h +index 14afb7db52..f40f411871 100644 +--- a/xen/include/asm-x86/shadow.h ++++ b/xen/include/asm-x86/shadow.h +@@ -124,8 +124,102 @@ static inline int shadow_domctl(struct domain *d, + * What we can do is force a PV guest which writes a vulnerable PTE into + * shadow mode, so Xen controls the pagetables which are reachable by the CPU + * pagewalk. ++ * ++ * The core of the L1TF vulnerability is that the address bits of the PTE ++ * (accounting for PSE and factoring in the level-relevant part of the linear ++ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or ++ * eventual memory address) before the Present or reserved bits (which would ++ * cause a terminal fault) are accounted for. If an L1D hit occurs, the ++ * resulting data is available for potentially dependent instructions. ++ * ++ * For Present PTEs, the PV type-count safety logic ensures that the address ++ * bits always point at a guest-accessible frame, which is safe WRT L1TF from ++ * Xen's point of view. In practice, a PV guest should be unable to set any ++ * reserved bits, so should be unable to create any present L1TF-vulnerable ++ * PTEs at all. ++ * ++ * Therefore, these safety checks apply to Not-Present PTEs only, where ++ * traditionally, Xen would have let the guest write any value it chose. ++ * ++ * The all-zero PTE potentially leaks mfn 0. All software on the system is ++ * expected to cooperate and not put any secrets there. In a Xen system, ++ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains ++ * the real mode IVT and Bios Data Area. Therefore, mfn 0 is considered safe. ++ * ++ * Any PTE whose address is higher than the maximum cacheable address is safe, ++ * as it won't get an L1D hit. ++ * ++ * Speculative superpages also need accounting for, as PSE is considered ++ * irrespective of Present. We disallow PSE being set, as it allows an ++ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of ++ * recursive/linear pagetables, we must consider PSE even at L4, as hardware ++ * will interpret an L4e as an L3e during a recursive walk. + */ + ++static inline bool is_l1tf_safe_maddr(intpte_t pte) ++{ ++ paddr_t maddr = pte & l1tf_addr_mask; ++ ++ return maddr == 0 || maddr >= l1tf_safe_maddr; ++} ++ ++static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level, ++ intpte_t pte) ++{ ++ ASSERT(is_pv_domain(d)); ++ ASSERT(!(pte & _PAGE_PRESENT)); ++ ++ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) && ++ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) ) ++ { ++#ifdef CONFIG_SHADOW_PAGING ++ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet; ++ ++ printk(XENLOG_G_WARNING ++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n", ++ d->domain_id, level, pte); ++ /* ++ * Safety consideration for accessing tasklet.scheduled_on without the ++ * tasklet lock. This is a singleshot tasklet with the side effect of ++ * setting PG_SH_forced (checked just above). Multiple vcpus can race ++ * to schedule the tasklet, but if we observe it scheduled anywhere, ++ * that is good enough. 
++ */ ++ smp_rmb(); ++ if ( !tasklet_is_scheduled(t) ) ++ tasklet_schedule(t); ++#else ++ printk(XENLOG_G_ERR ++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n", ++ d->domain_id, level, pte); ++ domain_crash(d); ++#endif ++ return true; ++ } ++ ++ return false; ++} ++ ++static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e) ++{ ++ return pv_l1tf_check_pte(d, 1, l1e.l1); ++} ++ ++static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e) ++{ ++ return pv_l1tf_check_pte(d, 2, l2e.l2); ++} ++ ++static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e) ++{ ++ return pv_l1tf_check_pte(d, 3, l3e.l3); ++} ++ ++static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e) ++{ ++ return pv_l1tf_check_pte(d, 4, l4e.l4); ++} ++ + void pv_l1tf_tasklet(unsigned long data); + + static inline void pv_l1tf_domain_init(struct domain *d) +diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h +index 23d69c738e..bc9ddace6d 100644 +--- a/xen/include/xen/tasklet.h ++++ b/xen/include/xen/tasklet.h +@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu) + TASKLET_scheduled); + } + ++static inline bool tasklet_is_scheduled(const struct tasklet *t) ++{ ++ return t->scheduled_on != -1; ++} ++ + void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu); + void tasklet_schedule(struct tasklet *t); + void do_tasklet(void); +-- +2.17.1 + + +From fb78137bb82d3d8bcac36430b8bc331008ee3826 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 28 Mar 2018 15:21:39 +0100 +Subject: [PATCH 38/42] x86/spec-ctrl: CPUID/MSR definitions for L1D_FLUSH + +This is part of XSA-273 / CVE-2018-3646. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 3563fc2b2731a63fd7e8372ab0f5cef205bf8477) +--- + docs/misc/xen-command-line.markdown | 8 ++++---- + tools/libxl/libxl_cpuid.c | 1 + + tools/misc/xen-cpuid.c | 2 +- + xen/arch/x86/cpuid.c | 5 +++++ + xen/arch/x86/spec_ctrl.c | 4 +++- + xen/include/asm-x86/msr-index.h | 4 ++++ + xen/include/public/arch-x86/cpufeatureset.h | 1 + + 7 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 763cc1d878..158b5bb919 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -489,10 +489,10 @@ accounting for hardware capabilities as enumerated via CPUID. + + Currently accepted: + +-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, `ssbd` are +-used by default if available and applicable. They can be ignored, +-e.g. `no-ibrsb`, at which point Xen won't use them itself, and won't offer +-them to guests. ++The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, ++`l1d-flush` and `ssbd` are used by default if available and applicable. They can ++be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and ++won't offer them to guests. 
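The shadow.h comment above condenses to a single predicate over a not-present PTE. The sketch below is illustrative only and not part of the patch: Xen derives l1tf_addr_mask and l1tf_safe_maddr from the CPUID-reported address widths at boot, so the constants here (an address mask covering bits 12-51 and a 46-bit cacheable limit) are placeholder assumptions for one particular host.

    #include <stdbool.h>
    #include <stdint.h>

    #define _PAGE_PRESENT 0x001u
    #define _PAGE_PSE     0x080u

    /* Placeholder values; the hypervisor computes the real ones at boot. */
    static const uint64_t l1tf_addr_mask  = 0x000ffffffffff000ull;
    static const uint64_t l1tf_safe_maddr = 1ull << 46;

    /*
     * Would pv_l1tf_check_pte() force the owning domain into shadow mode for
     * this PTE?  Mirrors is_l1tf_safe_maddr() plus the PSE rule above.
     */
    static bool pte_is_l1tf_vulnerable(unsigned int level, uint64_t pte)
    {
        uint64_t maddr = pte & l1tf_addr_mask;

        if ( pte & _PAGE_PRESENT )
            return false;    /* Present PTEs are policed by the type counts. */

        if ( level > 1 && (pte & _PAGE_PSE) )
            return true;     /* Speculative superpage starting at mfn 0. */

        /* mfn 0 and anything above the cacheable limit cannot hit in the L1D. */
        return !(maddr == 0 || maddr >= l1tf_safe_maddr);
    }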
+ + ### cpuid\_mask\_cpu (AMD only) + > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b` +diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c +index 7b0f594c3d..52e16c20ed 100644 +--- a/tools/libxl/libxl_cpuid.c ++++ b/tools/libxl/libxl_cpuid.c +@@ -204,6 +204,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) + {"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1}, + {"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1}, + {"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1}, ++ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1}, + {"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1}, + {"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1}, + +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index e116339733..3888b4e158 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -143,7 +143,7 @@ static const char *str_7d0[32] = + [ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps", + + [26] = "ibrsb", [27] = "stibp", +- /* 28 */ [29] = "arch_caps", ++ [28] = "l1d_flush", [29] = "arch_caps", + /* 30 */ [31] = "ssbd", + }; + +diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c +index beee47d0ed..5cc89e2b34 100644 +--- a/xen/arch/x86/cpuid.c ++++ b/xen/arch/x86/cpuid.c +@@ -43,6 +43,11 @@ static int __init parse_xen_cpuid(const char *s) + if ( !val ) + setup_clear_cpu_cap(X86_FEATURE_STIBP); + } ++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) ++ { ++ if ( !val ) ++ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH); ++ } + else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) + { + if ( !val ) +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 7995e27218..9bcc2b6adc 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -266,14 +266,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + printk("Speculative mitigation facilities:\n"); + + /* Hardware features which pertain to speculative mitigations. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n", + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", ++ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "", + (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "", + (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", ++ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", + (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : ""); + + /* Compiled-in support which pertains to mitigations. */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 8fbccc88a7..7235623c86 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -47,8 +47,12 @@ + #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0) + #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1) + #define ARCH_CAPS_RSBA (_AC(1, ULL) << 2) ++#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) + #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) + ++#define MSR_FLUSH_CMD 0x0000010b ++#define FLUSH_CMD_L1D (_AC(1, ULL) << 0) ++ + /* Intel MSRs. 
Some also available on other CPUs */ + #define MSR_IA32_PERFCTR0 0x000000c1 + #define MSR_IA32_A_PERFCTR0 0x000004c1 +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index f1a5ed93e0..9f4c8246a9 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -244,6 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions * + XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */ + XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ + XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */ ++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /* MSR_FLUSH_CMD and L1D flush. */ + XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */ + XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */ + +-- +2.17.1 + + +From 007752fb9b85b9235fe2820677988c6408c583da Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 13 Apr 2018 15:34:01 +0000 +Subject: [PATCH 39/42] x86/msr: Virtualise MSR_FLUSH_CMD for guests + +Guests (outside of the nested virt case, which isn't supported yet) don't need +L1D_FLUSH for their L1TF mitigations, but offering/emulating MSR_FLUSH_CMD is +easy and doesn't pose an issue for Xen. + +The MSR is offered to HVM guests only. PV guests attempting to use it would +trap for emulation, and the L1D cache would fill long before the return to +guest context. As such, PV guests can't make any use of the L1D_FLUSH +functionality. + +This is part of XSA-273 / CVE-2018-3646. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit fd9823faf9df057a69a9a53c2e100691d3f4267c) +--- + xen/arch/x86/domctl.c | 3 ++- + xen/arch/x86/hvm/vmx/vmx.c | 6 ++++++ + xen/arch/x86/msr.c | 12 ++++++++++++ + xen/include/public/arch-x86/cpufeatureset.h | 2 +- + 4 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index fa82b6744e..dd91038a67 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -225,7 +225,8 @@ static int update_domain_cpuid_info(struct domain *d, + */ + call_policy_changed = (is_hvm_domain(d) && + ((old_7d0 ^ p->feat.raw[0].d) & +- cpufeat_mask(X86_FEATURE_IBRSB))); ++ (cpufeat_mask(X86_FEATURE_IBRSB) | ++ cpufeat_mask(X86_FEATURE_L1D_FLUSH)))); + break; + + case 0xa: +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index c7cf3a8fbc..b0fababede 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -583,6 +583,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v) + vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW); + else + vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW); ++ ++ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. 
*/ ++ if ( cp->feat.l1d_flush ) ++ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW); ++ else ++ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW); + } + + int vmx_guest_x86_mode(struct vcpu *v) +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index 1e12ccb729..1a591dd2b5 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -150,6 +150,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + case MSR_AMD_PATCHLOADER: + case MSR_IA32_UCODE_WRITE: + case MSR_PRED_CMD: ++ case MSR_FLUSH_CMD: + /* Write-only */ + goto gp_fault; + +@@ -254,6 +255,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + wrmsrl(MSR_PRED_CMD, val); + break; + ++ case MSR_FLUSH_CMD: ++ if ( !cp->feat.l1d_flush ) ++ goto gp_fault; /* MSR available? */ ++ ++ if ( val & ~FLUSH_CMD_L1D ) ++ goto gp_fault; /* Rsvd bit set? */ ++ ++ if ( v == curr ) ++ wrmsrl(MSR_FLUSH_CMD, val); ++ break; ++ + case MSR_INTEL_MISC_FEATURES_ENABLES: + { + bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting; +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 9f4c8246a9..6c82816fd3 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -244,7 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions * + XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */ + XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ + XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */ +-XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /* MSR_FLUSH_CMD and L1D flush. */ ++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */ + XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */ + XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */ + +-- +2.17.1 + + +From 2a47c7550910f5d591ca0de369234f8c18daa2d2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 29 May 2018 18:44:16 +0100 +Subject: [PATCH 40/42] x86/spec-ctrl: Introduce an option to control L1D_FLUSH + for HVM HAP guests + +This mitigation requires up-to-date microcode, and is enabled by default on +affected hardware if available, and is used for HVM guests + +The default for SMT/Hyperthreading is far more complicated to reason about, +not least because we don't know if the user is going to want to run any HVM +guests to begin with. If a explicit default isn't given, nag the user to +perform a risk assessment and choose an explicit default, and leave other +configuration to the toolstack. + +This is part of XSA-273 / CVE-2018-3620. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 3bd36952dab60290f33d6791070b57920e10754b) +--- + docs/misc/xen-command-line.markdown | 9 ++++++- + xen/arch/x86/hvm/vmx/vmcs.c | 5 ++++ + xen/arch/x86/spec_ctrl.c | 38 +++++++++++++++++++++++++++-- + xen/include/asm-x86/spec_ctrl.h | 1 + + 4 files changed, 50 insertions(+), 3 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 158b5bb919..57ef18194a 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1791,7 +1791,8 @@ false disable the quirk workaround, which is also the default. 
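The guest_wrmsr() hunk above only lets a write to MSR_FLUSH_CMD through when the guest has been offered the feature (CPUID.7.0:EDX bit 28) and no reserved bit is set; reads always fault because the MSR is write-only. A minimal sketch of that validation, detached from the vcpu plumbing (the helper name and the bool parameter are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    #define MSR_FLUSH_CMD 0x0000010b
    #define FLUSH_CMD_L1D (1ull << 0)

    /* true: perform the WRMSR; false: inject #GP, as guest_wrmsr() does. */
    static bool flush_cmd_wrmsr_ok(bool guest_has_l1d_flush, uint64_t val)
    {
        if ( !guest_has_l1d_flush )
            return false;                    /* MSR not advertised to this guest. */

        return (val & ~FLUSH_CMD_L1D) == 0;  /* Only bit 0 (L1D flush) is defined. */
    }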
+ + ### spec-ctrl (x86) + > `= List of [ , xen=, {pv,hvm,msr-sc,rsb}=, +-> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu}= ]` ++> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, ++> l1d-flush}= ]` + + Controls for speculative execution sidechannel mitigations. By default, Xen + will pick the most appropriate mitigations based on compiled in support, +@@ -1846,6 +1847,12 @@ from using fully eager FPU context switches. This is currently implemented as + a global control. By default, Xen will choose to use fully eager context + switches on hardware believed to speculate past #NM exceptions. + ++On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force ++or prevent Xen from issuing an L1 data cache flush on each VMEntry. ++Irrespective of Xen's setting, the feature is virtualised for HVM guests to ++use. By default, Xen will enable this mitigation on hardware believed to be ++vulnerable to L1TF. ++ + ### sync\_console + > `= ` + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 30a33dd0bd..2ba0c40808 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -1274,6 +1275,10 @@ static int construct_vmcs(struct vcpu *v) + + vmx_vlapic_msr_changed(v); + ++ if ( opt_l1d_flush && paging_mode_hap(d) ) ++ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, ++ VMX_MSR_GUEST_LOADONLY); ++ + out: + vmx_vmcs_exit(v); + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 9bcc2b6adc..59baebb959 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -19,11 +19,13 @@ + #include + #include + #include ++#include + + #include + #include + #include + #include ++#include + #include + #include + +@@ -46,6 +48,7 @@ static int8_t __initdata opt_ibrs = -1; + bool __read_mostly opt_ibpb = true; + bool __read_mostly opt_ssbd = false; + int8_t __read_mostly opt_eager_fpu = -1; ++int8_t __read_mostly opt_l1d_flush = -1; + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -139,6 +142,7 @@ static int __init parse_spec_ctrl(const char *s) + opt_ibrs = 0; + opt_ibpb = false; + opt_ssbd = false; ++ opt_l1d_flush = 0; + } + else if ( val > 0 ) + rc = -EINVAL; +@@ -194,6 +198,8 @@ static int __init parse_spec_ctrl(const char *s) + opt_ssbd = val; + else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) + opt_eager_fpu = val; ++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) ++ opt_l1d_flush = val; + else + rc = -EINVAL; + +@@ -290,7 +296,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n", ++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n", + thunk == THUNK_NONE ? "N/A" : + thunk == THUNK_RETPOLINE ? "RETPOLINE" : + thunk == THUNK_LFENCE ? "LFENCE" : +@@ -299,7 +305,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", + !boot_cpu_has(X86_FEATURE_SSBD) ? "" : + (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", +- opt_ibpb ? " IBPB" : ""); ++ opt_ibpb ? " IBPB" : "", ++ opt_l1d_flush ? " L1D_FLUSH" : ""); + + /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. 
*/ + if ( cpu_has_bug_l1tf || opt_pv_l1tf ) +@@ -871,6 +878,33 @@ void __init init_speculation_mitigations(void) + opt_pv_l1tf = OPT_PV_L1TF_DOMU; + } + ++ /* ++ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless ++ * instructed to skip the flush on vmentry by our outer hypervisor. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) ) ++ opt_l1d_flush = 0; ++ else if ( opt_l1d_flush == -1 ) ++ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL); ++ ++ /* ++ * We do not disable HT by default on affected hardware. ++ * ++ * Firstly, if the user intends to use exclusively PV, or HVM shadow ++ * guests, HT isn't a concern and should remain fully enabled. Secondly, ++ * safety for HVM HAP guests can be arranged by the toolstack with core ++ * parking, pinning or cpupool configurations, including mixed setups. ++ * ++ * However, if we are on affected hardware, with HT enabled, and the user ++ * hasn't explicitly chosen whether to use HT or not, nag them to do so. ++ */ ++ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim && ++ boot_cpu_data.x86_num_siblings > 1 ) ++ warning_add( ++ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n" ++ "enabled. Please assess your configuration and choose an\n" ++ "explicit 'smt=' setting. See XSA-273.\n"); ++ + print_details(thunk, caps); + + /* +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index cdf5737dc2..8f8aad40bb 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -29,6 +29,7 @@ void init_speculation_mitigations(void); + extern bool opt_ibpb; + extern bool opt_ssbd; + extern int8_t opt_eager_fpu; ++extern int8_t opt_l1d_flush; + + extern bool bsp_delay_spec_ctrl; + extern uint8_t default_xen_spec_ctrl; +-- +2.17.1 + + +From 6c7d074a4b5c8e69e21e505a04e7bb3f43658bea Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 13 Aug 2018 05:07:23 -0600 +Subject: [PATCH 41/42] x86: Make "spec-ctrl=no" a global disable of all + mitigations + +In order to have a simple and easy to remember means to suppress all the +more or less recent workarounds for hardware vulnerabilities, force +settings not controlled by "spec-ctrl=" also to their original defaults, +unless they've been forced to specific values already by earlier command +line options. + +This is part of XSA-273. + +Signed-off-by: Jan Beulich +Acked-by: Andrew Cooper +(cherry picked from commit d8800a82c3840b06b17672eddee4878bbfdacc6d) +--- + docs/misc/xen-command-line.markdown | 13 +++++++++---- + xen/arch/x86/spec_ctrl.c | 9 +++++++++ + 2 files changed, 18 insertions(+), 4 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 57ef18194a..0886706368 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1804,10 +1804,15 @@ extreme care.** + + An overall boolean value, `spec-ctrl=no`, can be specified to turn off all + mitigations, including pieces of infrastructure used to virtualise certain +-mitigation features for guests. Alternatively, a slightly more restricted +-`spec-ctrl=no-xen` can be used to turn off all of Xen's mitigations, while +-leaving the virtualisation support in place for guests to use. Use of a +-positive boolean value for either of these options is invalid. ++mitigation features for guests. This also includes settings which `xpti`, ++`smt`, `pv-l1tf` control, unless the respective option(s) have been ++specified earlier on the command line. 
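The init_speculation_mitigations() hunk above resolves the l1d-flush default in three steps. A condensed sketch, with the hardware and command-line state passed in explicitly (the function and parameter names are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    /* cmdline is -1 when no explicit "l1d-flush=" option was given. */
    static int8_t resolve_l1d_flush(int8_t cmdline, bool cpu_has_l1d_flush,
                                    bool cpu_has_bug_l1tf, bool caps_skip_l1dfl)
    {
        if ( !cpu_has_l1d_flush )
            return 0;                /* No MSR_FLUSH_CMD: nothing to enable. */

        if ( cmdline != -1 )
            return cmdline;          /* An explicit choice always wins. */

        /*
         * Default: flush on L1TF-vulnerable hardware, unless the outer
         * hypervisor reports ARCH_CAPS_SKIP_L1DFL, i.e. it flushes for us.
         */
        return cpu_has_bug_l1tf && !caps_skip_l1dfl;
    }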
++ ++Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to ++turn off all of Xen's mitigations, while leaving the virtualisation support ++in place for guests to use. ++ ++Use of a positive boolean value for either of these options is invalid. + + The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control + over the alternative blocks used by Xen. These impact Xen's ability to +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 59baebb959..f0c50d6703 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -134,6 +134,15 @@ static int __init parse_spec_ctrl(const char *s) + + opt_eager_fpu = 0; + ++ if ( opt_xpti < 0 ) ++ opt_xpti = 0; ++ ++ if ( opt_smt < 0 ) ++ opt_smt = 1; ++ ++ if ( opt_pv_l1tf < 0 ) ++ opt_pv_l1tf = 0; ++ + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +-- +2.17.1 + + +From d757c29ffe2e31b15397e43cd58da88b6318b654 Mon Sep 17 00:00:00 2001 +From: Wei Liu +Date: Tue, 7 Aug 2018 15:35:34 +0100 +Subject: [PATCH 42/42] xl.conf: Add global affinity masks + +XSA-273 involves one hyperthread being able to use Spectre-like +techniques to "spy" on another thread. The details are somewhat +complicated, but the upshot is that after all Xen-based mitigations +have been applied: + +* PV guests cannot spy on sibling threads +* HVM guests can spy on sibling threads + +(NB that for purposes of this vulnerability, PVH and HVM guests are +identical. Whenever this comment refers to 'HVM', this includes PVH.) + +There are many possible mitigations to this, including disabling +hyperthreading entirely. But another solution would be: + +* Specify some cores as PV-only, others as PV or HVM +* Allow HVM guests to only run on thread 0 of the "HVM-or-PV" cores +* Allow PV guests to run on the above cores, as well as any thread of the PV-only cores. + +For example, suppose you had 16 threads across 8 cores (0-7). You +could specify 0-3 as PV-only, and 4-7 as HVM-or-PV. Then you'd set +the affinity of the HVM guests as follows (binary representation): + +0000000010101010 + +And the affinity of the PV guests as follows: + +1111111110101010 + +In order to make this easy, this patches introduces three "global affinity +masks", placed in xl.conf: + + vm.cpumask + vm.hvm.cpumask + vm.pv.cpumask + +These are parsed just like the 'cpus' and 'cpus_soft' options in the +per-domain xl configuration files. The resulting mask is AND-ed with +whatever mask results at the end of the xl configuration file. +`vm.cpumask` would be applied to all guest types, `vm.hvm.cpumask` +would be applied to HVM and PVH guest types, and `vm.pv.cpumask` +would be applied to PV guest types. + +The idea would be that to implement the above mask across all your +VMs, you'd simply add the following two lines to the configuration +file: + + vm.hvm.cpumask=8,10,12,14 + vm.pv.cpumask=0-8,10,12,14 + +See xl.conf manpage for details. + +This is part of XSA-273 / CVE-2018-3646. 
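The worked example in the commit message writes its binary masks with CPU 0 as the leftmost bit, which is why they correspond to the xl.conf lists 8,10,12,14 and 0-8,10,12,14. The small stand-alone program below reproduces those masks, assuming the conventional numbering where thread t of core c is CPU 2*c + t (an assumption about the host's topology enumeration, not something the patch mandates):

    #include <stdio.h>

    /* 8 cores x 2 threads; cores 0-3 PV-only, cores 4-7 HVM-or-PV. */
    int main(void)
    {
        unsigned int hvm = 0, pv = 0, core;

        for ( core = 4; core < 8; core++ )
            hvm |= 1u << (2 * core);     /* thread 0 of each HVM-capable core */

        for ( core = 0; core < 4; core++ )
            pv |= 3u << (2 * core);      /* both threads of each PV-only core */
        pv |= hvm;                       /* PV may also use the HVM threads */

        printf("vm.hvm.cpumask bits: %#06x\n", hvm); /* 0x5500: CPUs 8,10,12,14 */
        printf("vm.pv.cpumask  bits: %#06x\n", pv);  /* 0x55ff: CPUs 0-8,10,12,14 */
        return 0;
    }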
+ +Signed-off-by: George Dunlap +Signed-off-by: Wei Liu +(cherry picked from commit aa67b97ed34279c43a43d9ca46727b5746caa92e) +--- + docs/man/xl.conf.pod.5 | 22 ++++++++++++ + tools/examples/xl.conf | 5 +++ + tools/xl/xl.c | 26 ++++++++++++++ + tools/xl/xl.h | 7 ++++ + tools/xl/xl_cmdtable.c | 6 ++-- + tools/xl/xl_vcpu.c | 80 +++++++++++++++++++++++++++++++++++++++-- + tools/xl/xl_vmcontrol.c | 39 ++++++++++++++++++-- + 7 files changed, 179 insertions(+), 6 deletions(-) + +diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5 +index da91b8626c..37262a7ef8 100644 +--- a/docs/man/xl.conf.pod.5 ++++ b/docs/man/xl.conf.pod.5 +@@ -185,6 +185,28 @@ massively huge guests). + + =back + ++=item B="CPULIST" ++ ++=item B="CPULIST" ++ ++=item B="CPULIST" ++ ++Global masks that are applied when creating guests and pinning vcpus ++to indicate which cpus they are allowed to run on. Specifically, ++C applies to all guest types, C applies to ++both HVM and PVH guests and C applies to PV guests. ++ ++The hard affinity of guest's vcpus are logical-AND'ed with respective ++masks. If the resulting affinity mask is empty, operation will fail. ++ ++Use --ignore-global-affinity-masks to skip applying global masks. ++ ++The default value for these masks are all 1's, i.e. all cpus are allowed. ++ ++Due to bug(s), these options may not interact well with other options ++concerning CPU affinity. One example is CPU pools. Users should always double ++check that the required affinity has taken effect. ++ + =back + + =head1 SEE ALSO +diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf +index 374b6bbc2e..0446deb304 100644 +--- a/tools/examples/xl.conf ++++ b/tools/examples/xl.conf +@@ -37,3 +37,8 @@ + # (which can take a long time to find out if launching huge guests). + # see xl.conf(5) for details. + #claim_mode=1 ++ ++# Specify global vcpu hard affinity masks. See xl.conf(5) for details. 
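The manpage text above defines the effective hard affinity as the logical AND of the per-domain affinity with vm.cpumask and the type-specific mask, failing if the result is empty. apply_global_affinity_masks() further down implements this on libxl bitmaps; the sketch below reduces it to plain 64-bit masks purely for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * effective = config & vm.cpumask & (vm.hvm.cpumask or vm.pv.cpumask).
     * Returns false when the intersection is empty, in which case xl aborts
     * rather than leave a vcpu with nowhere to run.
     */
    static bool combine_affinity(uint64_t config_affinity, uint64_t vm_mask,
                                 uint64_t type_mask, uint64_t *out)
    {
        uint64_t eff = config_affinity & vm_mask & type_mask;

        if ( eff == 0 )
            return false;

        *out = eff;
        return true;
    }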
++#vm.cpumask="0-7" ++#vm.pv.cpumask="0-3" ++#vm.hvm.cpumask="3-7" +diff --git a/tools/xl/xl.c b/tools/xl/xl.c +index 179908b4f6..7d2142f16f 100644 +--- a/tools/xl/xl.c ++++ b/tools/xl/xl.c +@@ -28,6 +28,9 @@ + #include + #include + #include "xl.h" ++#include "xl_parse.h" ++ ++#include "xl_utils.h" + + xentoollog_logger_stdiostream *logger; + int dryrun_only; +@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL; + char *default_vifbackend = NULL; + char *default_remus_netbufscript = NULL; + char *default_colo_proxy_script = NULL; ++libxl_bitmap global_vm_affinity_mask; ++libxl_bitmap global_hvm_affinity_mask; ++libxl_bitmap global_pv_affinity_mask; + enum output_format default_output_format = OUTPUT_FORMAT_JSON; + int claim_mode = 1; + bool progress_use_cr = 0; +@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile, + if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0)) + max_maptrack_frames = l; + ++ libxl_bitmap_init(&global_vm_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0); ++ libxl_bitmap_init(&global_hvm_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0); ++ libxl_bitmap_init(&global_pv_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0); ++ ++ if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0)) ++ parse_cpurange(buf, &global_vm_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_vm_affinity_mask); ++ if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0)) ++ parse_cpurange(buf, &global_hvm_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_hvm_affinity_mask); ++ if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0)) ++ parse_cpurange(buf, &global_pv_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_pv_affinity_mask); ++ + xlu_cfg_destroy(config); + } + +diff --git a/tools/xl/xl.h b/tools/xl/xl.h +index 4e784ff402..7e97144b50 100644 +--- a/tools/xl/xl.h ++++ b/tools/xl/xl.h +@@ -41,6 +41,7 @@ struct domain_create { + int vncautopass; + int console_autoconnect; + int checkpointed_stream; ++ int ignore_global_affinity_masks; + const char *config_file; + char *extra_config; /* extra config string */ + const char *restore_file; +@@ -279,6 +280,9 @@ extern char *default_colo_proxy_script; + extern char *blkdev_start; + extern int max_grant_frames; + extern int max_maptrack_frames; ++extern libxl_bitmap global_vm_affinity_mask; ++extern libxl_bitmap global_hvm_affinity_mask; ++extern libxl_bitmap global_pv_affinity_mask; + + enum output_format { + OUTPUT_FORMAT_JSON, +@@ -294,6 +298,9 @@ typedef enum { + } domain_restart_type; + + extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh); ++extern void apply_global_affinity_masks(libxl_domain_type type, ++ libxl_bitmap *vcpu_affinity_array, ++ unsigned int size); + + #define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf" + #define XL_LOCK_FILE XEN_LOCK_DIR "/xl" +diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c +index bf2ced8140..54c2db6022 100644 +--- a/tools/xl/xl_cmdtable.c ++++ b/tools/xl/xl_cmdtable.c +@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = { + "-e Do not wait in the background for the death of the domain.\n" + "-V, --vncviewer Connect to the VNC display after the domain is created.\n" + "-A, --vncviewer-autopass\n" +- " Pass VNC password to viewer via stdin." ++ " Pass VNC password to viewer via stdin.\n" ++ "--ignore-global-affinity-masks Ignore global masks in xl.conf." 
+ }, + { "config-update", + &main_config_update, 1, 1, +@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = { + &main_vcpupin, 1, 1, + "Set which CPUs a VCPU can use", + "[option] ", +- "-f, --force undo an override pinning done by the kernel", ++ "-f, --force undo an override pinning done by the kernel\n" ++ "--ignore-global-affinity-masks Ignore global masks in xl.conf", + }, + { "vcpu-set", + &main_vcpuset, 0, 1, +diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c +index 8e735b38c1..3384eeed06 100644 +--- a/tools/xl/xl_vcpu.c ++++ b/tools/xl/xl_vcpu.c +@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus) + libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu); + } + ++void apply_global_affinity_masks(libxl_domain_type type, ++ libxl_bitmap *vcpu_affinity_array, ++ unsigned int size) ++{ ++ libxl_bitmap *mask = &global_vm_affinity_mask; ++ libxl_bitmap *type_mask; ++ unsigned int i; ++ ++ switch (type) { ++ case LIBXL_DOMAIN_TYPE_HVM: ++ case LIBXL_DOMAIN_TYPE_PVH: ++ type_mask = &global_hvm_affinity_mask; ++ break; ++ case LIBXL_DOMAIN_TYPE_PV: ++ type_mask = &global_pv_affinity_mask; ++ break; ++ default: ++ fprintf(stderr, "Unknown guest type\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ for (i = 0; i < size; i++) { ++ int rc; ++ libxl_bitmap *t = &vcpu_affinity_array[i]; ++ libxl_bitmap b1, b2; ++ ++ libxl_bitmap_init(&b1); ++ libxl_bitmap_init(&b2); ++ ++ rc = libxl_bitmap_and(ctx, &b1, t, mask); ++ if (rc) { ++ fprintf(stderr, "libxl_bitmap_and errored\n"); ++ exit(EXIT_FAILURE); ++ } ++ rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask); ++ if (rc) { ++ fprintf(stderr, "libxl_bitmap_and errored\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (libxl_bitmap_is_empty(&b2)) { ++ fprintf(stderr, "vcpu hard affinity map is empty\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ /* Replace target bitmap with the result */ ++ libxl_bitmap_dispose(t); ++ libxl_bitmap_init(t); ++ libxl_bitmap_copy_alloc(ctx, t, &b2); ++ ++ libxl_bitmap_dispose(&b1); ++ libxl_bitmap_dispose(&b2); ++ } ++} ++ + static void vcpulist(int argc, char **argv) + { + libxl_dominfo *dominfo; +@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv) + { + static struct option opts[] = { + {"force", 0, 0, 'f'}, ++ {"ignore-global-affinity-masks", 0, 0, 'i'}, + COMMON_LONG_OPTS + }; + libxl_vcpuinfo *vcpuinfo; +@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv) + const char *vcpu, *hard_str, *soft_str; + char *endptr; + int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE; +- bool force = false; ++ bool force = false, ignore_masks = false; + + libxl_bitmap_init(&cpumap_hard); + libxl_bitmap_init(&cpumap_soft); + +- SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) { ++ SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) { + case 'f': + force = true; + break; ++ case 'i': ++ ignore_masks = true; ++ break; + default: + break; + } +@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv) + goto out; + } + ++ /* Only hard affinity matters here */ ++ if (!ignore_masks) { ++ libxl_domain_config d_config; ++ ++ libxl_domain_config_init(&d_config); ++ rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config); ++ if (rc) { ++ fprintf(stderr, "Could not retrieve domain configuration\n"); ++ libxl_domain_config_dispose(&d_config); ++ goto out; ++ } ++ ++ apply_global_affinity_masks(d_config.b_info.type, hard, 1); ++ ++ libxl_domain_config_dispose(&d_config); ++ } ++ + if (force) { + if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) { + fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n", +diff 
--git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c +index 89c2b25ded..a1d633795c 100644 +--- a/tools/xl/xl_vmcontrol.c ++++ b/tools/xl/xl_vmcontrol.c +@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info) + parse_config_data(config_source, config_data, config_len, &d_config); + } + ++ if (!dom_info->ignore_global_affinity_masks) { ++ libxl_domain_build_info *b_info = &d_config.b_info; ++ ++ /* It is possible that no hard affinity is specified in config file. ++ * Generate hard affinity maps now if we care about those. ++ */ ++ if (b_info->num_vcpu_hard_affinity == 0 && ++ (!libxl_bitmap_is_full(&global_vm_affinity_mask) || ++ (b_info->type == LIBXL_DOMAIN_TYPE_PV && ++ !libxl_bitmap_is_full(&global_pv_affinity_mask)) || ++ (b_info->type != LIBXL_DOMAIN_TYPE_PV && ++ !libxl_bitmap_is_full(&global_hvm_affinity_mask)) ++ )) { ++ b_info->num_vcpu_hard_affinity = b_info->max_vcpus; ++ b_info->vcpu_hard_affinity = ++ xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap)); ++ ++ for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) { ++ libxl_bitmap *m = &b_info->vcpu_hard_affinity[i]; ++ libxl_bitmap_init(m); ++ libxl_cpu_bitmap_alloc(ctx, m, 0); ++ libxl_bitmap_set_any(m); ++ } ++ } ++ ++ apply_global_affinity_masks(b_info->type, ++ b_info->vcpu_hard_affinity, ++ b_info->num_vcpu_hard_affinity); ++ } ++ + if (migrate_fd >= 0) { + if (d_config.c_info.name) { + /* when we receive a domain we get its name from the config +@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv) + const char *filename = NULL; + struct domain_create dom_info; + int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0, +- quiet = 0, monitor = 1, vnc = 0, vncautopass = 0; ++ quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0; + int opt, rc; + static struct option opts[] = { + {"dryrun", 0, 0, 'n'}, +@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv) + {"defconfig", 1, 0, 'f'}, + {"vncviewer", 0, 0, 'V'}, + {"vncviewer-autopass", 0, 0, 'A'}, ++ {"ignore-global-affinity-masks", 0, 0, 'i'}, + COMMON_LONG_OPTS + }; + +@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv) + argc--; argv++; + } + +- SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) { ++ SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVAi", opts, "create", 0) { + case 'f': + filename = optarg; + break; +@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv) + case 'A': + vnc = vncautopass = 1; + break; ++ case 'i': ++ ignore_masks = 1; ++ break; + } + + memset(&dom_info, 0, sizeof(dom_info)); +@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv) + dom_info.vnc = vnc; + dom_info.vncautopass = vncautopass; + dom_info.console_autoconnect = console_autoconnect; ++ dom_info.ignore_global_affinity_masks = ignore_masks; + + rc = create_domain(&dom_info); + if (rc < 0) { +-- +2.17.1 +
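Finally, the create_domain() hunk above only synthesises default per-vcpu hard-affinity maps when the domain config provides none and at least one relevant global mask is not all-ones. A sketch of that condition, with libxl bitmaps replaced by simple "mask is full" flags (names are illustrative):

    #include <stdbool.h>

    static bool need_default_affinity(unsigned int num_vcpu_hard_affinity,
                                      bool is_pv, bool vm_mask_full,
                                      bool pv_mask_full, bool hvm_mask_full)
    {
        if ( num_vcpu_hard_affinity != 0 )
            return false;    /* The config already supplies maps to AND with. */

        return !vm_mask_full ||
               (is_pv ? !pv_mask_full : !hvm_mask_full);
    }

In practice this path is only taken when xl.conf sets one of vm.cpumask, vm.hvm.cpumask or vm.pv.cpumask and the guest config supplies no hard affinity of its own; passing --ignore-global-affinity-masks to xl create (or to xl vcpu-pin) bypasses the global masks entirely.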