From e932371d6ae0f69b89abb2dce725483c75356de2 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:17:27 +0200
Subject: [PATCH 02/42] xen: Port the array_index_nospec() infrastructure from
 Linux

This is as the infrastructure appeared in Linux 4.17, adapted slightly for
Xen.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
master commit: 2ddfae51d8b1d7b8cd33a4f6ad4d16d27cb869ae
master date: 2018-07-06 16:49:57 +0100
---
 xen/include/asm-arm/arm32/system.h | 18 ++++++++
 xen/include/asm-arm/arm64/system.h | 22 ++++++++++
 xen/include/asm-x86/system.h       | 24 ++++++++++
 xen/include/xen/compiler.h         |  3 ++
 xen/include/xen/nospec.h           | 70 ++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+)
 create mode 100644 xen/include/xen/nospec.h

diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h
index c617b40438..ab57abfbc5 100644
--- a/xen/include/asm-arm/arm32/system.h
+++ b/xen/include/asm-arm/arm32/system.h
@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void)
     return !(flags & PSR_FIQ_MASK);
 }
 
+#define CSDB    ".inst  0xe320f014"
+
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
+                                                    unsigned long sz)
+{
+    unsigned long mask;
+
+    asm volatile( "cmp    %1, %2\n"
+                  "sbc    %0, %1, %1\n"
+                  CSDB
+                  : "=r" (mask)
+                  : "r" (idx), "Ir" (sz)
+                  : "cc" );
+
+    return mask;
+}
+#define array_index_mask_nospec array_index_mask_nospec
+
 #endif
 /*
  * Local variables:
diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
index 2e2ee212a1..2e36573ac6 100644
--- a/xen/include/asm-arm/arm64/system.h
+++ b/xen/include/asm-arm/arm64/system.h
@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void)
     return !(flags & PSR_FIQ_MASK);
 }
 
+#define csdb()  asm volatile ( "hint #20" : : : "memory" )
+
+/*
+ * Generate a mask for array_index_nospec() that is ~0UL when 0 <= idx < sz
+ * and 0 otherwise.
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
+                                                    unsigned long sz)
+{
+    unsigned long mask;
+
+    asm volatile ( "cmp     %1, %2\n"
+                   "sbc     %0, xzr, xzr\n"
+                   : "=r" (mask)
+                   : "r" (idx), "Ir" (sz)
+                   : "cc" );
+    csdb();
+
+    return mask;
+}
+#define array_index_mask_nospec array_index_mask_nospec
+
 #endif
 /*
  * Local variables:
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index 43fb6fe489..483cd20afd 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd(
 #define set_mb(var, value) do { xchg(&var, value); } while (0)
 #define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
 
+/**
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
+ *      bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * Returns:
+ *     0 - (index < size)
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+                                                    unsigned long size)
+{
+    unsigned long mask;
+
+    asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
+                   : [mask] "=r" (mask)
+                   : [size] "g" (size), [index] "r" (index) );
+
+    return mask;
+}
+
+/* Override default implementation in nospec.h. */
+#define array_index_mask_nospec array_index_mask_nospec
+
 #define local_irq_disable()     asm volatile ( "cli" : : : "memory" )
 #define local_irq_enable()      asm volatile ( "sti" : : : "memory" )
 
diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
index 533a8ea0f3..a7e05681c9 100644
--- a/xen/include/xen/compiler.h
+++ b/xen/include/xen/compiler.h
@@ -81,6 +81,9 @@
 #pragma GCC visibility push(hidden)
 #endif
 
+/* Make the optimizer believe the variable can be manipulated arbitrarily. */
+#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) )
+
 /* This macro obfuscates arithmetic on a variable address so that gcc
    shouldn't recognize the original var, and make assumptions about it */
 /*
diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
new file mode 100644
index 0000000000..48793996e8
--- /dev/null
+++ b/xen/include/xen/nospec.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */
+/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */
+/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
+/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */
+
+#ifndef XEN_NOSPEC_H
+#define XEN_NOSPEC_H
+
+#include <asm/system.h>
+
+/**
+ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * When @index is out of bounds (@index >= @size), the sign bit will be
+ * set.  Extend the sign bit to all bits and invert, giving a result of
+ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
+ */
+#ifndef array_index_mask_nospec
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+                                                    unsigned long size)
+{
+    /*
+     * Always calculate and emit the mask even if the compiler
+     * thinks the mask is not needed. The compiler does not take
+     * into account the value of @index under speculation.
+     */
+    OPTIMIZER_HIDE_VAR(index);
+    return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
+}
+#endif
+
+/*
+ * array_index_nospec - sanitize an array index after a bounds check
+ *
+ * For a code sequence like:
+ *
+ *     if (index < size) {
+ *         index = array_index_nospec(index, size);
+ *         val = array[index];
+ *     }
+ *
+ * ...if the CPU speculates past the bounds check then
+ * array_index_nospec() will clamp the index within the range of [0,
+ * size).
+ */
+#define array_index_nospec(index, size)                                 \
+({                                                                      \
+    typeof(index) _i = (index);                                         \
+    typeof(size) _s = (size);                                           \
+    unsigned long _mask = array_index_mask_nospec(_i, _s);              \
+                                                                        \
+    BUILD_BUG_ON(sizeof(_i) > sizeof(long));                            \
+    BUILD_BUG_ON(sizeof(_s) > sizeof(long));                            \
+                                                                        \
+    (typeof(_i)) (_i & _mask);                                          \
+})
+
+#endif /* XEN_NOSPEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
2.17.1
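
To illustrate how the infrastructure above is meant to be used, here is a
minimal host-side sketch of the generic fallback mask and the clamping it
performs. This is not Xen code: mask_nospec() and the test values are made
up for illustration, and the sketch assumes a gcc-style arithmetic right
shift of negative values, as the Xen fallback itself does.

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

/* Generic fallback, mirroring xen/include/xen/nospec.h: evaluates to
 * ~0UL when index < size and to 0 otherwise, without a branch that the
 * CPU could mispredict. */
static unsigned long mask_nospec(unsigned long index, unsigned long size)
{
    return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}

int main(void)
{
    unsigned long array_size = 8, idx;

    for ( idx = 0; idx < 12; idx++ )
    {
        /* Out-of-bounds indices are clamped to 0 by the AND with the mask. */
        unsigned long safe = idx & mask_nospec(idx, array_size);

        printf("idx %2lu -> mask %#18lx -> safe idx %lu\n",
               idx, mask_nospec(idx, array_size), safe);
    }
    return 0;
}

For in-bounds indices both index and size - 1 - index have a clear sign bit,
so the inverted OR is negative and the arithmetic shift produces all ones;
for out-of-bounds indices one of the two operands has the sign bit set, and
the result is 0.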


From da33530ab393dcc04d3e35424956277669b8d8ce Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:18:54 +0200
Subject: [PATCH 03/42] x86: correctly set nonlazy_xstate_used when loading
 full state

In this case, just like xcr0_accum, nonlazy_xstate_used should always be
set to the intended new value, rather than possibly leaving the flag set
from a prior state load.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: f46bf0e101ca63118b9db2616e8f51e972d7f563
master date: 2018-07-09 10:51:02 +0200
---
 xen/arch/x86/domctl.c  | 3 +--
 xen/arch/x86/hvm/hvm.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 8fbbf3aeb3..b04388d663 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1187,8 +1187,7 @@ long arch_do_domctl(
                 vcpu_pause(v);
                 v->arch.xcr0 = _xcr0;
                 v->arch.xcr0_accum = _xcr0_accum;
-                if ( _xcr0_accum & XSTATE_NONLAZY )
-                    v->arch.nonlazy_xstate_used = 1;
+                v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
                 compress_xsave_states(v, _xsave_area,
                                       evc->size - PV_XSAVE_HDR_SIZE);
                 vcpu_unpause(v);
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index c23983cdff..279cb88e45 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1324,8 +1324,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
 
     v->arch.xcr0 = ctxt->xcr0;
     v->arch.xcr0_accum = ctxt->xcr0_accum;
-    if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
-        v->arch.nonlazy_xstate_used = 1;
+    v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY;
     compress_xsave_states(v, &ctxt->save_area,
                           size - offsetof(struct hvm_hw_cpu_xsave, save_area));
 
-- 
2.17.1
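
The pattern being fixed is a flag that can only ever be raised: a
conditional set leaves a previously-set value behind when the new state
doesn't warrant it. A standalone sketch of the before/after behaviour
(illustrative only; the XSTATE_NONLAZY value here is made up):

#include <stdbool.h>
#include <stdio.h>

#define XSTATE_NONLAZY 0x4  /* illustrative bit, not Xen's actual mask */

/* Buggy pattern: the flag can only ever be raised, so a value left behind
 * by a previous state load survives a load whose accumulator lacks the
 * non-lazy bits. */
static void load_buggy(bool *nonlazy_used, unsigned long xcr0_accum)
{
    if ( xcr0_accum & XSTATE_NONLAZY )
        *nonlazy_used = true;
}

/* Fixed pattern, as in the patch: always assign the intended new value. */
static void load_fixed(bool *nonlazy_used, unsigned long xcr0_accum)
{
    *nonlazy_used = xcr0_accum & XSTATE_NONLAZY;
}

int main(void)
{
    bool used = false;

    load_buggy(&used, XSTATE_NONLAZY);  /* first load sets the flag */
    load_buggy(&used, 0);               /* second load wrongly keeps it */
    printf("buggy: %d\n", used);        /* prints 1 */

    load_fixed(&used, 0);               /* flag now reflects the new state */
    printf("fixed: %d\n", used);        /* prints 0 */
    return 0;
}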


From 4bdeedbd611c59f07878eb22955f655a81452835 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:19:41 +0200
Subject: [PATCH 04/42] x86/spec-ctrl: command line handling adjustments

For one, "no-xen" should not imply "no-eager-fpu", as "eager FPU" mode
is to guard guests, not Xen itself, which is also expressed so by
print_details().

And then opt_ssbd, despite being off by default, should also be cleared
by the "no" and "no-xen" sub-options.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: ac3f9a72141a48d40fabfff561d5a7dc0e1b810d
master date: 2018-07-10 12:22:31 +0200
---
 xen/arch/x86/spec_ctrl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 08e6784c4c..73dc7170c7 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -124,6 +124,8 @@ static int __init parse_spec_ctrl(const char *s)
             opt_msr_sc_pv = false;
             opt_msr_sc_hvm = false;
 
+            opt_eager_fpu = 0;
+
         disable_common:
             opt_rsb_pv = false;
             opt_rsb_hvm = false;
@@ -131,7 +133,7 @@ static int __init parse_spec_ctrl(const char *s)
             opt_thunk = THUNK_JMP;
             opt_ibrs = 0;
             opt_ibpb = false;
-            opt_eager_fpu = 0;
+            opt_ssbd = false;
         }
         else if ( val > 0 )
             rc = -EINVAL;
-- 
2.17.1


From 10c548215b052a266c53abd9d37d08b06ed91bb3 Mon Sep 17 00:00:00 2001
From: Ian Jackson <Ian.Jackson@eu.citrix.com>
Date: Mon, 30 Jul 2018 11:20:44 +0200
Subject: [PATCH 05/42] xen: oprofile/nmi_int.c: Drop unwanted sexual reference

This is not really very nice.

This line doesn't have much value in itself.  The rest of this comment
block is pretty clear what it wants to convey.  So delete it.

(While we are here, adopt the CODING_STYLE-mandated formatting.)

Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
Acked-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Lars Kurth <lars.kurth.xen@gmail.com>
Acked-by: George Dunlap <dunlapg@umich.edu>
Acked-by: Jan Beulich <JBeulich@suse.com>
master commit: 41cb2db62627a7438d938aae487550c3f4acb1da
master date: 2018-07-12 16:38:30 +0100
---
 xen/arch/x86/oprofile/nmi_int.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c
index d8f5230906..3dfb8fef93 100644
--- a/xen/arch/x86/oprofile/nmi_int.c
+++ b/xen/arch/x86/oprofile/nmi_int.c
@@ -182,7 +182,7 @@ int nmi_reserve_counters(void)
 	if (!allocate_msrs())
 		return -ENOMEM;
 
-	/* We walk a thin line between law and rape here.
+	/*
 	 * We need to be careful to install our NMI handler
 	 * without actually triggering any NMIs as this will
 	 * break the core code horrifically.
-- 
2.17.1


From ac35e050b64a565fe234dd42e8dac163e946e58d Mon Sep 17 00:00:00 2001
From: Sergey Dyasli <sergey.dyasli@citrix.com>
Date: Mon, 30 Jul 2018 11:21:28 +0200
Subject: [PATCH 06/42] mm/page_alloc: correct first_dirty calculations during
 block merging

Currently it's possible to hit an assertion in alloc_heap_pages():

Assertion 'first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub)' failed at page_alloc.c:988

This can happen because a piece of logic to calculate first_dirty
during block merging in free_heap_pages() is missing for the following
scenario:

1. The current block's first_dirty equals INVALID_DIRTY_IDX
2. The successor block is free, but its first_dirty != INVALID_DIRTY_IDX
3. The successor is merged into the current block
4. The current block's first_dirty still equals INVALID_DIRTY_IDX

This will trigger the assertion during allocation of such a block in
alloc_heap_pages() because there will be pages with the PGC_need_scrub
bit set despite first_dirty claiming that the block is scrubbed.

Add the missing piece of logic and slightly update the comment for
the predecessor case to better capture the code's intent.

Fixes: 1a37f33ea613 ("mm: Place unscrubbed pages at the end of pagelist")

Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
master commit: 1e2df9608857b5355f2ec3b1a34b87a2007dcd16
master date: 2018-07-12 10:45:11 +0200
---
 xen/common/page_alloc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 20ee1e4897..02aeed7c47 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1426,7 +1426,7 @@ static void free_heap_pages(
 
             page_list_del(predecessor, &heap(node, zone, order));
 
-            /* Keep predecessor's first_dirty if it is already set. */
+            /* Update predecessor's first_dirty if necessary. */
             if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
                  pg->u.free.first_dirty != INVALID_DIRTY_IDX )
                 predecessor->u.free.first_dirty = (1U << order) +
@@ -1447,6 +1447,12 @@ static void free_heap_pages(
 
             check_and_stop_scrub(successor);
 
+            /* Update pg's first_dirty if necessary. */
+            if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
+                 successor->u.free.first_dirty != INVALID_DIRTY_IDX )
+                pg->u.free.first_dirty = (1U << order) +
+                                         successor->u.free.first_dirty;
+
             page_list_del(successor, &heap(node, zone, order));
         }
 
-- 
2.17.1
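
The arithmetic of the added hunk is easiest to see in isolation: when the
successor forms the upper half of the merged block, its first dirty page
moves up by 2^order in the combined block's index space. A simplified
sketch (the struct below is an illustrative stand-in, not Xen's struct
page_info):

#include <stdio.h>

#define INVALID_DIRTY_IDX (~0U)  /* "no unscrubbed page in this block" */

struct block {
    unsigned int first_dirty;    /* index of the first unscrubbed page */
};

/* Merging 'succ' (the upper half) into 'lo' (the lower half) of the new
 * 2^(order+1)-page block: the successor's pages now sit at offsets
 * [2^order, 2^(order+1)), so its first_dirty shifts up by 2^order.
 * This is the step the patch adds for the successor case. */
static void merge(struct block *lo, const struct block *succ,
                  unsigned int order)
{
    if ( lo->first_dirty == INVALID_DIRTY_IDX &&
         succ->first_dirty != INVALID_DIRTY_IDX )
        lo->first_dirty = (1U << order) + succ->first_dirty;
}

int main(void)
{
    /* The scenario from the commit message: clean block, dirty successor. */
    struct block lo = { INVALID_DIRTY_IDX };
    struct block succ = { 3 };   /* page 3 of the upper half is dirty */
    unsigned int order = 4;      /* merging two 16-page blocks */

    merge(&lo, &succ, order);
    printf("merged first_dirty = %u\n", lo.first_dirty);  /* 16 + 3 = 19 */
    return 0;
}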


From a44cf0c8728e08858638170a057675ca5479fdc7 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:22:06 +0200
Subject: [PATCH 07/42] allow cpu_down() to be called earlier

The function's use of the stop-machine logic has so far prevented its
use ahead of the processing of the "ordinary" initcalls. Since at this
early time we're in a controlled environment anyway, there's no need for
such a heavy tool. Additionally, this ought to have less of a performance
impact, especially on large systems, compared to the alternative of
making stop-machine functionality available earlier.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: 5894c0a2da66243a89088d309c7e1ea212ab28d6
master date: 2018-07-16 15:15:12 +0200
---
 xen/common/cpu.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/xen/common/cpu.c b/xen/common/cpu.c
index 6350f150bd..653a56b840 100644
--- a/xen/common/cpu.c
+++ b/xen/common/cpu.c
@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb)
     spin_unlock(&cpu_add_remove_lock);
 }
 
-static int take_cpu_down(void *unused)
+static void _take_cpu_down(void *unused)
 {
     void *hcpu = (void *)(long)smp_processor_id();
     int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL);
     BUG_ON(notifier_rc != NOTIFY_DONE);
     __cpu_disable();
+}
+
+static int take_cpu_down(void *arg)
+{
+    _take_cpu_down(arg);
     return 0;
 }
 
@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu)
         goto fail;
     }
 
-    if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
+    if ( unlikely(system_state < SYS_STATE_active) )
+        on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
+    else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
         goto fail;
 
     __cpu_die(cpu);
-- 
2.17.1


From b53e0defcea1400c03f83d1d5cc30a3b237c8cfe Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:22:42 +0200
Subject: [PATCH 08/42] x86/svm: Fixes and cleanup to svm_inject_event()

 * State adjustments (and debug tracing) for #DB/#BP/#PF should not be done
   for `int $n` instructions.  Updates to %cr2 occur even if the exception
   combines to #DF.
 * Don't open-code DR_STEP when updating %dr6.
 * Simplify the logic for calling svm_emul_swint_injection(), as in the
   common case every condition needs checking.
 * Fix comments which have become stale as code has moved between components.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
master commit: 8dab867c81ede455009028a9a88edc4ff3b9da88
master date: 2018-07-17 10:12:40 +0100
---
 xen/arch/x86/hvm/svm/svm.c | 41 ++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 165500e3f2..b964c59dad 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event)
      * Xen must emulate enough of the event injection to be sure that a
      * further fault shouldn't occur during delivery.  This covers the fact
      * that hardware doesn't perform DPL checking on injection.
-     *
-     * Also, it accounts for proper positioning of %rip for an event with trap
-     * semantics (where %rip should point after the instruction) which suffers
-     * a fault during injection (at which point %rip should point at the
-     * instruction).
      */
     if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
-         (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
-                                 event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
+         (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) )
         svm_emul_swint_injection(&_event);
 
-    switch ( _event.vector )
+    switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
     {
     case TRAP_debug:
         if ( regs->eflags & X86_EFLAGS_TF )
         {
             __restore_debug_registers(vmcb, curr);
-            vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
+            vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP);
         }
         /* fall through */
     case TRAP_int3:
@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event)
             domain_pause_for_debugger();
             return;
         }
+        break;
+
+    case TRAP_page_fault:
+        ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
+        curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
+        vmcb_set_cr2(vmcb, _event.cr2);
+        break;
     }
 
     if ( unlikely(eventinj.fields.v) &&
@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event)
      * icebp, software events with trap semantics need emulating, so %rip in
      * the trap frame points after the instruction.
      *
-     * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
-     * have performed checks such as presence/dpl/etc and believes that the
-     * event injection will succeed without faulting.
-     *
-     * The x86 emulator will always provide fault semantics for software
-     * events, with _trap.insn_len set appropriately.  If the injection
-     * requires emulation, move %rip forwards at this point.
+     * svm_emul_swint_injection() has already confirmed that events with trap
+     * semantics won't fault on injection.  Position %rip/NextRIP suitably,
+     * and restrict the event type to what hardware will tolerate.
      */
     switch ( _event.type )
     {
@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event)
            eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
     vmcb->eventinj = eventinj;
 
-    if ( _event.vector == TRAP_page_fault )
-    {
-        curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
-        vmcb_set_cr2(vmcb, _event.cr2);
-        HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
-    }
+    if ( _event.vector == TRAP_page_fault &&
+         _event.type == X86_EVENTTYPE_HW_EXCEPTION )
+        HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
+                         TRC_PAR_LONG(_event.cr2));
     else
-    {
         HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
-    }
 }
 
 static int svm_event_pending(struct vcpu *v)
-- 
2.17.1
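
The less obvious part of this patch is the switch expression: -(cond)
evaluates to 0 or ~0, so ORing it into the vector turns every software
interrupt into the value -1, which matches none of the per-vector case
labels. A simplified demonstration (the event-type enum is reduced to two
made-up values for illustration):

#include <stdio.h>

#define TRAP_int3 3  /* x86 #BP vector */

/* Reduced to two event types for illustration; the real code has more. */
enum ev_type { HW_EXCEPTION, SW_INTERRUPT };

/* -(cond) is 0 when cond is false and ~0 (i.e. -1) when true, so ORing it
 * into the vector maps every "int $n" event to -1, which matches no
 * per-vector case label -- exactly the effect the patch relies on. */
static const char *classify(int vector, enum ev_type type)
{
    switch ( vector | -(type == SW_INTERRUPT) )
    {
    case TRAP_int3:
        return "takes the #BP state adjustments";
    default:
        return "skips them";
    }
}

int main(void)
{
    printf("exception #BP (vector 3): %s\n", classify(3, HW_EXCEPTION));
    printf("int $3 (vector 3, SW):    %s\n", classify(3, SW_INTERRUPT));
    return 0;
}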


From 0a2016ca2fabfe674c311dcfd8e15fec0ba3f7b6 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:23:22 +0200
Subject: [PATCH 09/42] cpupools: fix state when downing a CPU failed

While I've run into the issue only with further patches in place, which
no longer guarantee that the per-CPU area starts out as all zeros, the
CPU_DOWN_FAILED processing looks to have the same issue: by not zapping
the per-CPU cpupool pointer, cpupool_cpu_add()'s (indirect) invocation
of schedule_cpu_switch() will trigger the "c != old_pool" assertion
there.

Clearing the field during CPU_DOWN_PREPARE is too early (afaict this
should not happen before cpu_disable_scheduler()). Clearing it in
CPU_DEAD and CPU_DOWN_FAILED would be an option, but would take the same
piece of code twice. Since the field's value shouldn't matter while the
CPU is offline, simply clear it (implicitly) for CPU_ONLINE and
CPU_DOWN_FAILED, but only for other than the suspend/resume case (which
gets specially handled in cpupool_cpu_remove()).

By adjusting the conditional in cpupool_cpu_add() CPU_DOWN_FAILED
handling in the suspend case should now also be handled better.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
master commit: cb1ae9a27819cea0c5008773c68a7be6f37eb0e5
master date: 2018-07-19 09:41:55 +0200
---
 xen/common/cpupool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index 999839444e..1e8edcbd57 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu)
     cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
     cpumask_set_cpu(cpu, &cpupool_free_cpus);
 
-    if ( system_state == SYS_STATE_resume )
+    if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume )
     {
         struct cpupool **c;
 
@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu)
          * (or unplugging would have failed) and that is the default behavior
          * anyway.
          */
+        per_cpu(cpupool, cpu) = NULL;
         ret = cpupool_assign_cpu_locked(cpupool0, cpu);
     }
  out:
-- 
2.17.1


From bd51a6424202a5f1cd13dee6614bcb69ecbd2458 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:24:01 +0200
Subject: [PATCH 10/42] x86/AMD: distinguish compute units from hyper-threads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fam17 replaces CUs with HTs, which we should reflect accordingly, even if
the difference is not very big. The most relevant change (requiring some
code restructuring) is that the topoext feature no longer means there is
a valid CU ID.

Take the opportunity and convert wrongly plain int variables in
set_cpu_sibling_map() to unsigned int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Brian Woods <brian.woods@amd.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: 9429b07a0af7f92a5f25e4068e11db881e157495
master date: 2018-07-19 09:42:42 +0200
---
 xen/arch/x86/cpu/amd.c | 16 +++++++++++-----
 xen/arch/x86/smpboot.c | 32 ++++++++++++++++++++------------
 2 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 458a3fe60c..76078b55b2 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
                 u32 eax, ebx, ecx, edx;
 
                 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-                c->compute_unit_id = ebx & 0xFF;
                 c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1;
+
+                if (c->x86 < 0x17)
+                        c->compute_unit_id = ebx & 0xFF;
+                else {
+                        c->cpu_core_id = ebx & 0xFF;
+                        c->x86_max_cores /= c->x86_num_siblings;
+                }
         }
         
         if (opt_cpu_info)
                 printk("CPU %d(%d) -> Processor %d, %s %d\n",
                        cpu, c->x86_max_cores, c->phys_proc_id,
-                       cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" : 
-                                                         "Core",
-                       cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id :
-                                                         c->cpu_core_id);
+                       c->compute_unit_id != INVALID_CUID ? "Compute Unit"
+                                                          : "Core",
+                       c->compute_unit_id != INVALID_CUID ? c->compute_unit_id
+                                                          : c->cpu_core_id);
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index d4478e6132..78ba73578a 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -234,33 +234,41 @@ static void link_thread_siblings(int cpu1, int cpu2)
     cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
 }
 
-static void set_cpu_sibling_map(int cpu)
+static void set_cpu_sibling_map(unsigned int cpu)
 {
-    int i;
+    unsigned int i;
     struct cpuinfo_x86 *c = cpu_data;
 
     cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
 
     cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
+    cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
+    cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
 
     if ( c[cpu].x86_num_siblings > 1 )
     {
         for_each_cpu ( i, &cpu_sibling_setup_map )
         {
-            if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
-                if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
-                     (c[cpu].compute_unit_id == c[i].compute_unit_id) )
+            if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
+                continue;
+            if ( c[cpu].compute_unit_id != INVALID_CUID &&
+                 c[i].compute_unit_id != INVALID_CUID )
+            {
+                if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
+                    link_thread_siblings(cpu, i);
+            }
+            else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
+                      c[i].cpu_core_id != XEN_INVALID_CORE_ID )
+            {
+                if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
                     link_thread_siblings(cpu, i);
-            } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
-                        (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
-                link_thread_siblings(cpu, i);
             }
+            else
+                printk(XENLOG_WARNING
+                       "CPU%u: unclear relationship with CPU%u\n",
+                       cpu, i);
         }
     }
-    else
-    {
-        cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
-    }
 
     if ( c[cpu].x86_max_cores == 1 )
     {
-- 
2.17.1


From 5908b4866b682d9189c36eddf7c898fd95b27ec1 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:24:53 +0200
Subject: [PATCH 11/42] x86: distinguish CPU offlining from CPU removal

In order to be able to service #MC on offlined CPUs, the GDT, IDT,
stack, and per-CPU data (which includes the TSS) need to be kept
allocated. They should only be freed upon CPU removal (which we
currently don't support, so some code is becoming effectively dead for
the moment).

Note that for now park_offline_cpus doesn't get set to true anywhere -
this is going to be the subject of a subsequent patch.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: 2e6c8f182c9c50129b1c7a620242861e6ad6a9fb
master date: 2018-07-19 13:43:33 +0100
---
 xen/arch/x86/cpu/mcheck/mce.c | 15 ++++++--
 xen/arch/x86/domain.c         |  9 +++--
 xen/arch/x86/genapic/x2apic.c |  9 +++--
 xen/arch/x86/percpu.c         |  9 +++--
 xen/arch/x86/smpboot.c        | 71 ++++++++++++++++++++++-------------
 xen/include/asm-x86/smp.h     |  2 +
 xen/include/xen/cpu.h         |  2 +
 xen/include/xen/cpumask.h     | 23 ++++++++++++
 xen/include/xen/mm.h          |  8 ++++
 xen/include/xen/xmalloc.h     |  6 +++
 10 files changed, 115 insertions(+), 39 deletions(-)

diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index a8c287d124..32273d9208 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu)
 
     mcabanks_free(poll);
     mcabanks_free(clr);
+
+    per_cpu(poll_bankmask, cpu) = NULL;
+    per_cpu(mce_clear_banks, cpu) = NULL;
 }
 
 static int cpu_bank_alloc(unsigned int cpu)
 {
-    struct mca_banks *poll = mcabanks_alloc();
-    struct mca_banks *clr = mcabanks_alloc();
+    struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
+    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
 
     if ( !poll || !clr )
     {
@@ -725,7 +728,13 @@ static int cpu_callback(
 
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        cpu_bank_free(cpu);
+        if ( !park_offline_cpus )
+            cpu_bank_free(cpu);
+        break;
+
+    case CPU_REMOVE:
+        if ( park_offline_cpus )
+            cpu_bank_free(cpu);
         break;
     }
 
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9850a782ec..c39cf2c6e5 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -107,10 +107,11 @@ static void play_dead(void)
     local_irq_disable();
 
     /*
-     * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
-     * as they may be freed at any time. In this case, heap corruption or
-     * #PF can occur (when heap debugging is enabled). For example, even
-     * printk() can involve tasklet scheduling, which touches per-cpu vars.
+     * NOTE: After cpu_exit_clear, per-cpu variables may no longer be accessible,
+     * as they may be freed at any time if offline CPUs don't get parked. In
+     * this case, heap corruption or #PF can occur (when heap debugging is
+     * enabled). For example, even printk() can involve tasklet scheduling,
+     * which touches per-cpu vars.
      * 
      * Consider very carefully when adding code to *dead_idle. Most hypervisor
      * subsystems are unsafe to call.
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index 4779b0d0d5..d997806272 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -201,18 +201,21 @@ static int update_clusterinfo(
         if ( !cluster_cpus_spare )
             cluster_cpus_spare = xzalloc(cpumask_t);
         if ( !cluster_cpus_spare ||
-             !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+             !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
             err = -ENOMEM;
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
+    case CPU_REMOVE:
+        if ( park_offline_cpus == (action != CPU_REMOVE) )
+            break;
         if ( per_cpu(cluster_cpus, cpu) )
         {
             cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
             if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
-                xfree(per_cpu(cluster_cpus, cpu));
+                XFREE(per_cpu(cluster_cpus, cpu));
         }
-        free_cpumask_var(per_cpu(scratch_mask, cpu));
+        FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu));
         break;
     }
 
diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c
index c9997b7937..8be4ebddf4 100644
--- a/xen/arch/x86/percpu.c
+++ b/xen/arch/x86/percpu.c
@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu)
     char *p;
 
     if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
-        return -EBUSY;
+        return 0;
 
     if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
         return -ENOMEM;
@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        free_percpu_area(cpu);
+        if ( !park_offline_cpus )
+            free_percpu_area(cpu);
         break;
-    default:
+    case CPU_REMOVE:
+        if ( park_offline_cpus )
+            free_percpu_area(cpu);
         break;
     }
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 78ba73578a..7e76cc3d68 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+bool __read_mostly park_offline_cpus;
+
 unsigned int __read_mostly nr_sockets;
 cpumask_t **__read_mostly socket_cpumask;
 static cpumask_t *secondary_socket_cpumask;
@@ -895,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
     }
 }
 
-static void cpu_smpboot_free(unsigned int cpu)
+/*
+ * The 'remove' boolean controls whether a CPU is just getting offlined (and
+ * parked), or outright removed / offlined without parking. Parked CPUs need
+ * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
+ * A few other items, in particular CPU masks, are also retained, as it's
+ * difficult to prove that they're entirely unreferenced from parked CPUs.
+ */
+static void cpu_smpboot_free(unsigned int cpu, bool remove)
 {
     unsigned int order, socket = cpu_to_socket(cpu);
     struct cpuinfo_x86 *c = cpu_data;
@@ -906,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu)
         socket_cpumask[socket] = NULL;
     }
 
-    c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
-    c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
-    c[cpu].compute_unit_id = INVALID_CUID;
     cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
 
-    free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
-    free_cpumask_var(per_cpu(cpu_core_mask, cpu));
-    if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
-        free_cpumask_var(per_cpu(scratch_cpumask, cpu));
+    if ( remove )
+    {
+        c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+        c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+        c[cpu].compute_unit_id = INVALID_CUID;
+
+        FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
+        FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
+        if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
+            FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
+    }
 
     cleanup_cpu_root_pgt(cpu);
 
@@ -936,19 +949,21 @@ static void cpu_smpboot_free(unsigned int cpu)
     }
 
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
-    free_xenheap_pages(per_cpu(gdt_table, cpu), order);
+    if ( remove )
+        FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order);
 
     free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
 
-    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
-    free_xenheap_pages(idt_tables[cpu], order);
-    idt_tables[cpu] = NULL;
-
-    if ( stack_base[cpu] != NULL )
+    if ( remove )
     {
-        memguard_unguard_stack(stack_base[cpu]);
-        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
-        stack_base[cpu] = NULL;
+        order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+        FREE_XENHEAP_PAGES(idt_tables[cpu], order);
+
+        if ( stack_base[cpu] )
+        {
+            memguard_unguard_stack(stack_base[cpu]);
+            FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
+        }
     }
 }
 
@@ -963,15 +978,17 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
 
-    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
+    if ( stack_base[cpu] == NULL )
+        stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
     if ( stack_base[cpu] == NULL )
         goto out;
     memguard_guard_stack(stack_base[cpu]);
 
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
-    per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
+    gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags);
     if ( gdt == NULL )
         goto out;
+    per_cpu(gdt_table, cpu) = gdt;
     memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
     BUILD_BUG_ON(NR_CPUS > 0x10000);
     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
@@ -983,7 +1000,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
 
     order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
-    idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
+    if ( idt_tables[cpu] == NULL )
+        idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
     if ( idt_tables[cpu] == NULL )
         goto out;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
@@ -1011,16 +1029,16 @@ static int cpu_smpboot_alloc(unsigned int cpu)
          (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
         goto out;
 
-    if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
-           zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
-           alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
+    if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
+           cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+           cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
         goto out;
 
     rc = 0;
 
  out:
     if ( rc )
-        cpu_smpboot_free(cpu);
+        cpu_smpboot_free(cpu, true);
 
     return rc;
 }
@@ -1038,9 +1056,10 @@ static int cpu_smpboot_callback(
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        cpu_smpboot_free(cpu);
+        cpu_smpboot_free(cpu, !park_offline_cpus);
         break;
-    default:
+    case CPU_REMOVE:
+        cpu_smpboot_free(cpu, true);
         break;
     }
 
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 4e5f673fec..09c55458df 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
 DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
 
+extern bool park_offline_cpus;
+
 void smp_send_nmi_allbutself(void);
 
 void send_IPI_mask(const cpumask_t *, int vector);
diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h
index ffefc09f8e..2fe3ec05d8 100644
--- a/xen/include/xen/cpu.h
+++ b/xen/include/xen/cpu.h
@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb);
 #define CPU_DYING        (0x0007 | NOTIFY_REVERSE)
 /* CPU_DEAD: CPU is dead. */
 #define CPU_DEAD         (0x0008 | NOTIFY_REVERSE)
+/* CPU_REMOVE: CPU was removed. */
+#define CPU_REMOVE       (0x0009 | NOTIFY_REVERSE)
 
 /* Perform CPU hotplug. May return -EAGAIN. */
 int cpu_down(unsigned int cpu);
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
index 42340a098e..4a11bcc3f3 100644
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -351,16 +351,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
 	return *mask != NULL;
 }
 
+static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
+{
+	if (*mask == NULL)
+		*mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
+	return *mask != NULL;
+}
+
 static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
 {
 	*(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
 	return *mask != NULL;
 }
 
+static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
+{
+	if (*mask == NULL)
+		*mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
+	else
+		cpumask_clear(*mask);
+	return *mask != NULL;
+}
+
 static inline void free_cpumask_var(cpumask_var_t mask)
 {
 	xfree(mask);
 }
+
+/* Free an allocated mask, and zero the pointer to it. */
+#define FREE_CPUMASK_VAR(m) XFREE(m)
 #else
 typedef cpumask_t cpumask_var_t[1];
 
@@ -368,16 +387,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
 {
 	return 1;
 }
+#define cond_alloc_cpumask_var alloc_cpumask_var
 
 static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
 {
 	cpumask_clear(*mask);
 	return 1;
 }
+#define cond_zalloc_cpumask_var zalloc_cpumask_var
 
 static inline void free_cpumask_var(cpumask_var_t mask)
 {
 }
+
+#define FREE_CPUMASK_VAR(m) free_cpumask_var(m)
 #endif
 
 #if NR_CPUS > 1
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index e928551c91..24654e8e22 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order);
 bool scrub_free_pages(void);
 #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
 #define free_xenheap_page(v) (free_xenheap_pages(v,0))
+
+/* Free an allocation, and zero the pointer to it. */
+#define FREE_XENHEAP_PAGES(p, o) do { \
+    free_xenheap_pages(p, o);         \
+    (p) = NULL;                       \
+} while ( false )
+#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
+
 /* Map machine page range in Xen virtual address space. */
 int map_pages_to_xen(
     unsigned long virt,
diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
index cc2673d8ae..9aa5edf593 100644
--- a/xen/include/xen/xmalloc.h
+++ b/xen/include/xen/xmalloc.h
@@ -26,6 +26,12 @@
 /* Free any of the above. */
 extern void xfree(void *);
 
+/* Free an allocation, and zero the pointer to it. */
+#define XFREE(p) do { \
+    xfree(p);         \
+    (p) = NULL;       \
+} while ( false )
+
 /* Underlying functions */
 extern void *_xmalloc(unsigned long size, unsigned long align);
 extern void *_xzalloc(unsigned long size, unsigned long align);
-- 
2.17.1
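
A recurring idiom this patch introduces is freeing an allocation and zeroing
the pointer in one macro, so that a parked CPU's teardown can run more than
once without double frees or dangling pointers. A host-side analogue (Xen's
XFREE() wraps xfree(); libc free() stands in here):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* The free-and-NULL idiom the patch introduces. */
#define XFREE(p) do { \
    free(p);          \
    (p) = NULL;       \
} while ( false )

int main(void)
{
    int *buf = malloc(16 * sizeof(*buf));

    XFREE(buf);   /* frees and zeroes the pointer... */
    XFREE(buf);   /* ...so a second teardown pass is a harmless no-op */
    printf("buf = %p\n", (void *)buf);  /* (nil) */
    return 0;
}

Zeroing the pointer is also what makes the conditional allocation pattern in
cpu_smpboot_alloc() work: buffers retained for a parked CPU are found
non-NULL and reused, while anything actually freed reads back as NULL and
gets reallocated.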


From 75313e478e894176056e1fc5852136b344a0dc70 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:25:38 +0200
Subject: [PATCH 12/42] x86: possibly bring up all CPUs even if not all are
 supposed to be used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reportedly, Intel CPUs will shut down if they can't broadcast #MC to all
targeted cores/threads because some have CR4.MCE clear. Therefore we
want to keep CR4.MCE enabled when offlining a CPU, and we need to bring
up all CPUs in order to be able to set CR4.MCE in the first place.

The use of clear_in_cr4() in cpu_mcheck_disable() was ill-advised
anyway, and to avoid future similar mistakes I'm removing clear_in_cr4()
altogether right here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
master commit: 8797d20a6ec2dd75195585a107ce345c51c0a59a
master date: 2018-07-19 13:43:33 +0100
---
 xen/arch/x86/cpu/common.c           |  4 ++++
 xen/arch/x86/cpu/mcheck/mce_intel.c |  2 --
 xen/arch/x86/mpparse.c              | 15 +++++++++++----
 xen/arch/x86/setup.c                | 18 +++++++++++++++---
 xen/include/asm-x86/processor.h     |  6 ------
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 528aff1811..fdb022875a 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -14,6 +14,7 @@
 #include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
 
 #include "cpu.h"
+#include "mcheck/x86_mca.h"
 
 bool_t opt_arat = 1;
 boolean_param("arat", opt_arat);
@@ -355,6 +356,9 @@ static void __init early_cpu_detect(void)
 			hap_paddr_bits = PADDR_BITS;
 	}
 
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		park_offline_cpus = opt_mce;
+
 	initialize_cpu_data(0);
 }
 
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index e5dd956a24..4474a34e34 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -636,8 +636,6 @@ static void clear_cmci(void)
 
 static void cpu_mcheck_disable(void)
 {
-    clear_in_cr4(X86_CR4_MCE);
-
     if ( cmci_support && opt_mce )
         clear_cmci();
 }
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index 49140e46f0..f3f6d48668 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map;
 
 void __init set_nr_cpu_ids(unsigned int max_cpus)
 {
+	unsigned int tot_cpus = num_processors + disabled_cpus;
+
 	if (!max_cpus)
-		max_cpus = num_processors + disabled_cpus;
+		max_cpus = tot_cpus;
 	if (max_cpus > NR_CPUS)
 		max_cpus = NR_CPUS;
 	else if (!max_cpus)
 		max_cpus = 1;
 	printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n",
 	       max_cpus, max_t(int, max_cpus - num_processors, 0));
-	nr_cpu_ids = max_cpus;
+
+	if (!park_offline_cpus)
+		tot_cpus = max_cpus;
+	nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u);
+	if (park_offline_cpus && nr_cpu_ids < num_processors)
+		printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n",
+		       num_processors - nr_cpu_ids);
 
 #ifndef nr_cpumask_bits
-	nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) &
-			  ~(BITS_PER_LONG - 1);
+	nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG);
 	printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n",
 	       NR_CPUS, nr_cpumask_bits);
 #endif
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index a3172ca92c..984c948216 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -665,7 +665,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, num_parked = 0;
     multiboot_info_t *mbi;
     module_t *mod;
     unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
@@ -1494,7 +1494,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     else
     {
         set_nr_cpu_ids(max_cpus);
-        max_cpus = nr_cpu_ids;
+        if ( !max_cpus )
+            max_cpus = nr_cpu_ids;
     }
 
     if ( xen_guest )
@@ -1617,16 +1618,27 @@ void __init noreturn __start_xen(unsigned long mbi_p)
             /* Set up node_to_cpumask based on cpu_to_node[]. */
             numa_add_cpu(i);
 
-            if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
+            if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
+                 !cpu_online(i) )
             {
                 int ret = cpu_up(i);
                 if ( ret != 0 )
                     printk("Failed to bring up CPU %u (error %d)\n", i, ret);
+                else if ( num_online_cpus() > max_cpus )
+                {
+                    ret = cpu_down(i);
+                    if ( !ret )
+                        ++num_parked;
+                    else
+                        printk("Could not re-offline CPU%u (%d)\n", i, ret);
+                }
             }
         }
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
+    if ( num_parked )
+        printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
     smp_cpus_done();
 
     do_initcalls();
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 9924cdf1f3..2bd9e69684 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -337,12 +337,6 @@ static always_inline void set_in_cr4 (unsigned long mask)
     write_cr4(read_cr4() | mask);
 }
 
-static always_inline void clear_in_cr4 (unsigned long mask)
-{
-    mmu_cr4_features &= ~mask;
-    write_cr4(read_cr4() & ~mask);
-}
-
 static inline unsigned int read_pkru(void)
 {
     unsigned int pkru;
-- 
2.17.1


From 353edf12c865d2a1e24173aac841452b90614915 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Mon, 30 Jul 2018 11:26:16 +0200
Subject: [PATCH 13/42] x86: command line option to avoid use of secondary
 hyper-threads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Shared resources (L1 cache and TLB in particular) present a risk of
information leak via side channels. Provide a means to avoid use of
hyperthreads in such cases.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
master commit: d8f974f1a646c0200b97ebcabb808324b288fadb
master date: 2018-07-19 13:43:33 +0100
---
 docs/misc/xen-command-line.markdown |  7 +++++++
 xen/arch/x86/setup.c                |  8 +++++++-
 xen/arch/x86/sysctl.c               | 16 +++++++++++++++-
 xen/include/asm-x86/setup.h         |  2 ++
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 075e5ea159..3b710b71fb 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1748,6 +1748,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only.
 Flag to enable Supervisor Mode Execution Protection
 Use `smep=hvm` to allow SMEP use by HVM guests only.
 
+### smt (x86)
+> `= <boolean>`
+
+Default: `true`
+
+Control bring up of multiple hyper-threads per CPU core.
+
 ### snb\_igd\_quirk
 > `= <boolean> | cap | <integer>`
 
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 984c948216..66fd13f93a 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -62,6 +62,9 @@ boolean_param("nosmp", opt_nosmp);
 static unsigned int __initdata max_cpus;
 integer_param("maxcpus", max_cpus);
 
+int8_t __read_mostly opt_smt = -1;
+boolean_param("smt", opt_smt);
+
 /* opt_invpcid: If false, don't use INVPCID instruction even if available. */
 static bool __initdata opt_invpcid = true;
 boolean_param("invpcid", opt_invpcid);
@@ -1624,7 +1627,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
                 int ret = cpu_up(i);
                 if ( ret != 0 )
                     printk("Failed to bring up CPU %u (error %d)\n", i, ret);
-                else if ( num_online_cpus() > max_cpus )
+                else if ( num_online_cpus() > max_cpus ||
+                          (!opt_smt &&
+                           cpu_data[i].compute_unit_id == INVALID_CUID &&
+                           cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
                 {
                     ret = cpu_down(i);
                     if ( !ret )
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
index 4d372db12b..e704ed7f1c 100644
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -23,6 +23,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
+#include <asm/setup.h>
 #include <asm/smp.h>
 #include <asm/numa.h>
 #include <xen/nodemask.h>
@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg)
 
 long cpu_up_helper(void *data)
 {
-    int cpu = (unsigned long)data;
+    unsigned int cpu = (unsigned long)data;
     int ret = cpu_up(cpu);
+
     if ( ret == -EBUSY )
     {
         /* On EBUSY, flush RCU work and have one more go. */
         rcu_barrier();
         ret = cpu_up(cpu);
     }
+
+    if ( !ret && !opt_smt &&
+         cpu_data[cpu].compute_unit_id == INVALID_CUID &&
+         cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 )
+    {
+        ret = cpu_down_helper(data);
+        if ( ret )
+            printk("Could not re-offline CPU%u (%d)\n", cpu, ret);
+        else
+            ret = -EPERM;
+    }
+
     return ret;
 }
 
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
index 19232afa01..c09a5ff381 100644
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags;
 extern unsigned long highmem_start;
 #endif
 
+extern int8_t opt_smt;
+
 #ifdef CONFIG_SHADOW_PAGING
 extern bool opt_dom0_shadow;
 #else
-- 
2.17.1
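
The core of the "smt=0" handling is deciding whether a freshly onlined CPU
is a secondary thread of an already-online core, via
cpumask_weight(cpu_sibling_mask) > 1, and offlining it again if so. A
simplified standalone model (the topology array and NR_CPUS value are made
up for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Made-up topology: two threads per core, CPUs 2n and 2n+1 are siblings. */
static const unsigned int core_of[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };
static bool online[NR_CPUS];

/* Equivalent of cpumask_weight(cpu_sibling_mask) > 1 in this model: some
 * other already-online CPU shares this CPU's core. */
static bool is_secondary_thread(unsigned int cpu)
{
    unsigned int i;

    for ( i = 0; i < NR_CPUS; i++ )
        if ( i != cpu && online[i] && core_of[i] == core_of[cpu] )
            return true;
    return false;
}

int main(void)
{
    bool opt_smt = false;  /* "smt=0" given on the command line */
    unsigned int cpu;

    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
    {
        online[cpu] = true;                    /* bring the CPU up... */
        if ( !opt_smt && is_secondary_thread(cpu) )
        {
            online[cpu] = false;               /* ...then park it again */
            printf("CPU%u: parked (secondary thread)\n", cpu);
        }
        else
            printf("CPU%u: online\n", cpu);
    }
    return 0;
}

Like the patch, this brings the thread up first and then re-offlines it,
which is what allows the parked thread's state to stay allocated.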


From 037fe82cf5fadf0f74c3da70560ee7592a8f2083 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:26:53 +0200
Subject: [PATCH 14/42] x86/vmx: Don't clobber %dr6 while debugging state is
 lazy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

c/s 4f36452b63 introduced a write to %dr6 in the #DB intercept case, but the
guest's debug registers may be lazy at this point, in which case the guest's
later attempt to read %dr6 will discard this value and use the older, stale
value.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
master commit: 3cdac2805692c7accde2f405d81cc0be799aee48
master date: 2018-07-19 14:06:48 +0100
---
 xen/arch/x86/hvm/vmx/vmx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 610c8d6eb9..7189820bfc 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3701,6 +3701,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
              */
             __vmread(EXIT_QUALIFICATION, &exit_qualification);
             HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
+            __restore_debug_registers(v);
             write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
             if ( !v->domain->debugger_attached )
             {
-- 
2.17.1


From 543027c9842d8416047ef38846d2de1295052e92 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:27:33 +0200
Subject: [PATCH 15/42] x86/xstate: Use a guests CPUID policy, rather than
 allowing all features

It turns out that Xen has never enforced that a domain remain within the
xstate features advertised in CPUID.

The check of new_bv against xfeature_mask ensures that a domain stays within
the set of features that Xen has enabled in hardware (and therefore isn't a
security problem), but this does mean that attempts to level a guest for
migration safety might not be effective if the guest ignores CPUID.

Check the CPUID policy in validate_xstate() (for incoming migration) and in
handle_xsetbv() (for guest XSETBV instructions).  This subsumes the PKRU check
for PV guests in handle_xsetbv() (and also demonstrates that I should have
spotted this problem while reviewing c/s fbf9971241f).

For migration, this is correct despite the current (mis)ordering of data
because d->arch.cpuid is the applicable max policy.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
master commit: 361b835fa00d9f45167c50a60e054ccf22c065d7
master date: 2018-07-19 19:57:26 +0100
---
 xen/arch/x86/domctl.c        |  2 +-
 xen/arch/x86/hvm/hvm.c       |  2 +-
 xen/arch/x86/xstate.c        | 17 +++++++++++------
 xen/include/asm-x86/xstate.h |  5 +++--
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index b04388d663..fa82b6744e 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1163,7 +1163,7 @@ long arch_do_domctl(
             if ( _xcr0_accum )
             {
                 if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE )
-                    ret = validate_xstate(_xcr0, _xcr0_accum,
+                    ret = validate_xstate(d, _xcr0, _xcr0_accum,
                                           &_xsave_area->xsave_hdr);
             }
             else if ( !_xcr0 )
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 279cb88e45..d544720876 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1269,7 +1269,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
     ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
     h->cur += desc->length;
 
-    err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
+    err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum,
                           (const void *)&ctxt->save_area.xsave_hdr);
     if ( err )
     {
diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
index b4aea4b50a..1fbb0871d0 100644
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0)
     return !(xcr0 & X86_XCR0_BNDREGS) == !(xcr0 & X86_XCR0_BNDCSR);
 }
 
-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
+int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum,
+                    const struct xsave_hdr *hdr)
 {
+    const struct cpuid_policy *cp = d->arch.cpuid;
+    uint64_t xcr0_max =
+        ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
     unsigned int i;
 
     if ( (hdr->xstate_bv & ~xcr0_accum) ||
          (xcr0 & ~xcr0_accum) ||
+         (xcr0_accum & ~xcr0_max) ||
          !valid_xcr0(xcr0) ||
          !valid_xcr0(xcr0_accum) )
         return -EINVAL;
@@ -694,18 +699,18 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
 int handle_xsetbv(u32 index, u64 new_bv)
 {
     struct vcpu *curr = current;
+    const struct cpuid_policy *cp = curr->domain->arch.cpuid;
+    uint64_t xcr0_max =
+        ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
     u64 mask;
 
     if ( index != XCR_XFEATURE_ENABLED_MASK )
         return -EOPNOTSUPP;
 
-    if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
+    if ( (new_bv & ~xcr0_max) ||
+         (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
         return -EINVAL;
 
-    /* XCR0.PKRU is disabled on PV mode. */
-    if ( is_pv_vcpu(curr) && (new_bv & X86_XCR0_PKRU) )
-        return -EOPNOTSUPP;
-
     if ( !set_xcr0(new_bv) )
         return -EFAULT;
 
diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h
index 86a4a1f75c..47f602b855 100644
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -97,8 +97,9 @@ void xsave(struct vcpu *v, uint64_t mask);
 void xrstor(struct vcpu *v, uint64_t mask);
 void xstate_set_init(uint64_t mask);
 bool xsave_enabled(const struct vcpu *v);
-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum,
-                                 const struct xsave_hdr *);
+int __must_check validate_xstate(const struct domain *d,
+                                 uint64_t xcr0, uint64_t xcr0_accum,
+                                 const struct xsave_hdr *hdr);
 int __must_check handle_xsetbv(u32 index, u64 new_bv);
 void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size);
 void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size);
-- 
2.17.1


From 06d2a763d07d53a4ccc7bd1255ffc9ea01ec1609 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:29:00 +0200
Subject: [PATCH 16/42] x86/xstate: Make errors in xstate calculations more
 obvious by crashing the domain

If xcr0_max exceeds xfeature_mask, then something is broken with the CPUID
policy derivation or auditing logic.  If hardware rejects new_bv, then
something is broken with Xen's xstate logic.

In both cases, crash the domain with an obvious error message, to help
highlight the issues.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
master commit: d6371ccb93012db4ad6615fe666205b86308cb4e
master date: 2018-07-19 19:57:26 +0100
---
 xen/arch/x86/xstate.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
index 1fbb0871d0..15edd5df96 100644
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -707,12 +707,32 @@ int handle_xsetbv(u32 index, u64 new_bv)
     if ( index != XCR_XFEATURE_ENABLED_MASK )
         return -EOPNOTSUPP;
 
-    if ( (new_bv & ~xcr0_max) ||
-         (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
+    /*
+     * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's
+     * maximum features, but keep the check for robustness.
+     */
+    if ( unlikely(xcr0_max & ~xfeature_mask) )
+    {
+        gprintk(XENLOG_ERR,
+                "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n",
+                xcr0_max, xfeature_mask);
+        domain_crash(curr->domain);
+
+        return -EINVAL;
+    }
+
+    if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) )
         return -EINVAL;
 
-    if ( !set_xcr0(new_bv) )
+    /* By this point, new_bv really should be accepted by hardware. */
+    if ( unlikely(!set_xcr0(new_bv)) )
+    {
+        gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n",
+                new_bv);
+        domain_crash(curr->domain);
+
         return -EFAULT;
+    }
 
     mask = new_bv & ~curr->arch.xcr0_accum;
     curr->arch.xcr0 = new_bv;
-- 
2.17.1


From 7de21555730367497eb01edf6e9e9530224105e7 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:29:39 +0200
Subject: [PATCH 17/42] x86/hvm: Disallow unknown MSR_EFER bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out that nothing ever prevented HVM guests from trying to set unknown
EFER bits.  Generally, this results in a vmentry failure.

For Intel hardware, all implemented bits are covered by the checks.

For AMD hardware, the only EFER bit which isn't covered by the checks is TCE
(which AFAICT is specific to AMD Fam15/16 hardware).  We never advertise TCE
in CPUID, but it isn't a security problem to have TCE unexpectedly enabled in
guest context.

Disallow the setting of bits outside of the EFER_KNOWN_MASK, which prevents
any vmentry failures for guests, yielding #GP instead.
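
As a sketch of the check (the bit positions below match the architectural
EFER layout, but treat them as illustrative; the real constants come from
Xen's headers):

    #include <stddef.h>
    #include <stdint.h>

    #define EFER_SCE    (1u << 0)    /* SYSCALL enable */
    #define EFER_LME    (1u << 8)    /* Long mode enable */
    #define EFER_LMA    (1u << 10)   /* Long mode active */
    #define EFER_NX     (1u << 11)   /* No-execute enable */
    #define EFER_SVME   (1u << 12)   /* SVM enable */
    #define EFER_FFXSE  (1u << 14)   /* Fast FXSAVE/FXRSTOR */

    #define EFER_KNOWN_MASK \
        (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME | EFER_FFXSE)

    /* Reject unknown bits up front, before the feature-dependent checks. */
    static const char *check_efer(uint64_t value)
    {
        if ( value & ~EFER_KNOWN_MASK )
            return "Unknown bits set";

        return NULL;   /* per-feature checks (SCE, SVME, ...) follow */
    }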

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
master commit: ef0269c6215d642a709866f04ba1a1f9f13f3614
master date: 2018-07-24 11:25:53 +0100
---
 xen/arch/x86/hvm/hvm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index d544720876..4cbb688c05 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -907,6 +907,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
     else
         p = &host_cpuid_policy;
 
+    if ( value & ~EFER_KNOWN_MASK )
+        return "Unknown bits set";
+
     if ( (value & EFER_SCE) && !p->extd.syscall )
         return "SCE without feature";
 
-- 
2.17.1


From 33ced725e11af4eabd3334d12f53ed807e9e2586 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 30 Jul 2018 11:30:09 +0200
Subject: [PATCH 18/42] x86/spec-ctrl: Fix the parsing of xpti= on fixed Intel
 hardware

The calls to xpti_init_default() in parse_xpti() are buggy.  The CPUID data
hasn't been fetched that early, and boot_cpu_has(X86_FEATURE_ARCH_CAPS) will
always evaluate false.

As a result, the default case won't disable XPTI on Intel hardware which
advertises ARCH_CAPABILITIES_RDCL_NO.

Simplify parse_xpti() so that it solely sets opt_xpti according to the passed
string, and have init_speculation_mitigations() call xpti_init_default() if
appropriate.  Drop the force parameter, and pass caps instead, to avoid
redundant re-reading of MSR_ARCH_CAPS.
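
A sketch of the resulting split (the names and the RDCL_NO bit position are
illustrative stand-ins): the parser only ever records the user's choice, and
the default is computed once caps is genuinely available.

    #include <stdint.h>

    #define OPT_XPTI_DOM0      0x01
    #define OPT_XPTI_DOMU      0x02
    #define OPT_XPTI_DEFAULT   0xff
    #define ARCH_CAPS_RDCL_NO  (1ull << 0)

    static uint8_t opt_xpti = OPT_XPTI_DEFAULT;

    /* Called from init_speculation_mitigations(), after MSR_ARCH_CAPS has
     * been read (or synthesized for AMD); never from the early parser. */
    static void xpti_init_default_sketch(uint64_t caps)
    {
        opt_xpti = (caps & ARCH_CAPS_RDCL_NO)
                   ? 0 : (OPT_XPTI_DOM0 | OPT_XPTI_DOMU);
    }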

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
master commit: be5e2ff6f54e0245331ed360b8786760f82fd673
master date: 2018-07-24 11:25:54 +0100
---
 xen/arch/x86/spec_ctrl.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 73dc7170c7..32a4ea6e99 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -423,17 +423,10 @@ static bool __init should_use_eager_fpu(void)
 #define OPT_XPTI_DEFAULT  0xff
 uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
 
-static __init void xpti_init_default(bool force)
+static __init void xpti_init_default(uint64_t caps)
 {
-    uint64_t caps = 0;
-
-    if ( !force && (opt_xpti != OPT_XPTI_DEFAULT) )
-        return;
-
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
         caps = ARCH_CAPABILITIES_RDCL_NO;
-    else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
-        rdmsrl(MSR_ARCH_CAPABILITIES, caps);
 
     if ( caps & ARCH_CAPABILITIES_RDCL_NO )
         opt_xpti = 0;
@@ -446,8 +439,6 @@ static __init int parse_xpti(const char *s)
     const char *ss;
     int val, rc = 0;
 
-    xpti_init_default(false);
-
     do {
         ss = strchr(s, ',');
         if ( !ss )
@@ -465,7 +456,7 @@ static __init int parse_xpti(const char *s)
 
         default:
             if ( !strcmp(s, "default") )
-                xpti_init_default(true);
+                opt_xpti = OPT_XPTI_DEFAULT;
             else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
                 opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
                            (val ? OPT_XPTI_DOM0 : 0);
@@ -627,7 +618,9 @@ void __init init_speculation_mitigations(void)
     if ( default_xen_spec_ctrl )
         setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
 
-    xpti_init_default(false);
+    if ( opt_xpti == OPT_XPTI_DEFAULT )
+        xpti_init_default(caps);
+
     if ( opt_xpti == 0 )
         setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
     else
-- 
2.17.1


From 6fe9726aebc11433083b9810402501f1a71d02fd Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Thu, 9 Aug 2018 17:22:17 +0100
Subject: [PATCH 19/42] x86/spec-ctrl: Yet more fixes for xpti= parsing

As it currently stands, 'xpti=dom0' is indistinguishable from the default
value, which means it will be overridden by ARCH_CAPABILITIES_RDCL_NO on fixed
hardware.

Switch opt_xpti to use -1 as a default like all our other related options, and
clobber it as soon as we have a string to parse.

In addition, 'xpti' alone should be interpreted in its positive boolean form,
rather than resulting in a parse error.

  (XEN) parameter "xpti" has invalid value "", rc=-22!
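
The parsing shape, as a minimal sketch (error handling and the remaining
tokens elided):

    #include <stdint.h>
    #include <string.h>

    #define OPT_XPTI_DOM0  0x01
    #define OPT_XPTI_DOMU  0x02

    static int8_t opt_xpti = -1;   /* -1 => defaults chosen later from caps */

    static void parse_xpti_sketch(const char *s)
    {
        /* Any explicit choice inhibits the defaults... */
        if ( opt_xpti == -1 )
            opt_xpti = 0;

        /* ...and a bare "xpti" means its positive boolean form. */
        if ( *s == '\0' )
            opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
        else if ( !strcmp(s, "default") )
            opt_xpti = -1;         /* re-arm the deferred default */
    }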

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 2a3b34ec47817048ab59586855cf0709fc77487e)
---
 xen/arch/x86/spec_ctrl.c        | 15 +++++++++++----
 xen/include/asm-x86/spec_ctrl.h |  2 +-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 32a4ea6e99..32213ace86 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -420,8 +420,7 @@ static bool __init should_use_eager_fpu(void)
     }
 }
 
-#define OPT_XPTI_DEFAULT  0xff
-uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
+int8_t __read_mostly opt_xpti = -1;
 
 static __init void xpti_init_default(uint64_t caps)
 {
@@ -439,6 +438,14 @@ static __init int parse_xpti(const char *s)
     const char *ss;
     int val, rc = 0;
 
+    /* Inhibit the defaults as an explicit choice has been given. */
+    if ( opt_xpti == -1 )
+        opt_xpti = 0;
+
+    /* Interpret 'xpti' alone in its positive boolean form. */
+    if ( *s == '\0' )
+        opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
+
     do {
         ss = strchr(s, ',');
         if ( !ss )
@@ -456,7 +463,7 @@ static __init int parse_xpti(const char *s)
 
         default:
             if ( !strcmp(s, "default") )
-                opt_xpti = OPT_XPTI_DEFAULT;
+                opt_xpti = -1;
             else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
                 opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
                            (val ? OPT_XPTI_DOM0 : 0);
@@ -618,7 +625,7 @@ void __init init_speculation_mitigations(void)
     if ( default_xen_spec_ctrl )
         setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
 
-    if ( opt_xpti == OPT_XPTI_DEFAULT )
+    if ( opt_xpti == -1 )
         xpti_init_default(caps);
 
     if ( opt_xpti == 0 )
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index 5b40afbab0..fea82603ca 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -34,7 +34,7 @@ extern bool bsp_delay_spec_ctrl;
 extern uint8_t default_xen_spec_ctrl;
 extern uint8_t default_spec_ctrl_flags;
 
-extern uint8_t opt_xpti;
+extern int8_t opt_xpti;
 #define OPT_XPTI_DOM0  0x01
 #define OPT_XPTI_DOMU  0x02
 
-- 
2.17.1


From 4254e9874006cc2641b67d0531a3a65374f34c35 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Thu, 24 May 2018 17:20:09 +0000
Subject: [PATCH 20/42] x86/vmx: Fix handling of MSR_DEBUGCTL on VMExit

Currently, whenever the guest writes a nonzero value to MSR_DEBUGCTL, Xen
updates a host MSR load list entry with the current hardware value of
MSR_DEBUGCTL.

On VMExit, hardware automatically resets MSR_DEBUGCTL to 0.  Later, when the
guest writes to MSR_DEBUGCTL, the current value in hardware (0) is fed back
into the guest load list.  As a practical result, `ler` debugging gets lost on
any PCPU which has ever scheduled an HVM vcpu, and in the common case, when
`ler` debugging isn't active, guest actions result in an unnecessary load list
entry repeating the MSR_DEBUGCTL reset.

Restoration of Xen's debugging setting needs to happen from the very first
vmexit.  Due to the automatic reset, Xen need take no action in the general
case, and only needs to load a value when debugging is active.

This could be fixed by using a host MSR load list entry set up during
construct_vmcs().  However, a more efficient option is to use an alternative
block in the VMExit path, keyed on whether hypervisor debugging has been
enabled.

In order to set this up, drop the per-cpu ler_msr variable (as there is no
point having it per-cpu when it will be the same everywhere), and use a single
read_mostly variable instead.  Split calc_ler_msr() out of percpu_traps_init()
for clarity.

Finally, clean up do_debug().  Reinstate LBR early to help catch cascade
errors, which allows for the removal of the out label.
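
For illustration only: the MSR index and LBR bit below are the architectural
values, but the helper is a stand-in, and the real patch implements this in
assembly via an ALTERNATIVE rather than in C.

    #include <stdint.h>

    #define MSR_IA32_DEBUGCTLMSR   0x1d9      /* architectural MSR index */
    #define IA32_DEBUGCTLMSR_LBR   (1u << 0)  /* Last Branch Record enable */

    /* Stand-in for Xen's wrmsrl(). */
    static void wrmsrl(uint32_t msr, uint64_t val) { (void)msr; (void)val; }

    /*
     * Hardware zeroes MSR_DEBUGCTL on every VMExit, so when Xen itself
     * uses LBR (`ler` debugging), it must be re-enabled on the exit path.
     * The patched-in ALTERNATIVE code, keyed on the synthetic
     * X86_FEATURE_XEN_LBR bit, is the moral equivalent of:
     */
    static void reinstate_xen_lbr(void)
    {
        wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
    }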

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit 730dc8d2c9e1b6402e66973cf99a7c56bc78be4c)
---
 xen/arch/x86/hvm/vmx/entry.S      |  9 +++++
 xen/arch/x86/hvm/vmx/vmx.c        |  3 +-
 xen/arch/x86/traps.c              | 64 +++++++++++++++----------------
 xen/arch/x86/x86_64/traps.c       |  7 ++--
 xen/include/asm-x86/cpufeature.h  |  1 +
 xen/include/asm-x86/cpufeatures.h |  1 +
 xen/include/asm-x86/msr.h         |  2 +-
 7 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
index aa2f103895..afd552f2b9 100644
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -41,6 +41,15 @@ ENTRY(vmx_asm_vmexit_handler)
         SPEC_CTRL_ENTRY_FROM_HVM    /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
+        /* Hardware clears MSR_DEBUGCTL on VMExit.  Reinstate it if debugging Xen. */
+        .macro restore_lbr
+            mov $IA32_DEBUGCTLMSR_LBR, %eax
+            mov $MSR_IA32_DEBUGCTLMSR, %ecx
+            xor %edx, %edx
+            wrmsr
+        .endm
+        ALTERNATIVE "", restore_lbr, X86_FEATURE_XEN_LBR
+
         mov  %rsp,%rdi
         call vmx_vmexit_handler
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 7189820bfc..bb164359bb 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3124,8 +3124,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
                     }
         }
 
-        if ( (rc < 0) ||
-             (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
+        if ( rc < 0 )
             hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
         else
             __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 9f045a2045..789d7ff8cd 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi);
 DEFINE_PER_CPU(uint64_t, efer);
 static DEFINE_PER_CPU(unsigned long, last_extable_addr);
 
-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
-
 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
 
@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines);
 static bool opt_ler;
 boolean_param("ler", opt_ler);
 
+/* LastExceptionFromIP on this hardware.  Zero if LER is not in use. */
+unsigned int __read_mostly ler_msr;
+
 #define stack_words_per_line 4
 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
 
@@ -1778,17 +1779,6 @@ void do_device_not_available(struct cpu_user_regs *regs)
     return;
 }
 
-static void ler_enable(void)
-{
-    u64 debugctl;
-
-    if ( !this_cpu(ler_msr) )
-        return;
-
-    rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-    wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
-}
-
 void do_debug(struct cpu_user_regs *regs)
 {
     unsigned long dr6;
@@ -1821,6 +1811,10 @@ void do_debug(struct cpu_user_regs *regs)
      */
     write_debugreg(6, X86_DR6_DEFAULT);
 
+    /* #DB automatically disabled LBR.  Reinstate it if debugging Xen. */
+    if ( cpu_has_xen_lbr )
+        wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
+
     if ( !guest_mode(regs) )
     {
         /*
@@ -1838,7 +1832,7 @@ void do_debug(struct cpu_user_regs *regs)
             {
                 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
                     regs->eflags &= ~X86_EFLAGS_TF;
-                goto out;
+                return;
             }
             if ( !debugger_trap_fatal(TRAP_debug, regs) )
             {
@@ -1895,20 +1889,14 @@ void do_debug(struct cpu_user_regs *regs)
                 regs->cs, _p(regs->rip), _p(regs->rip),
                 regs->ss, _p(regs->rsp), dr6);
 
-        goto out;
+        return;
     }
 
     /* Save debug status register where guest OS can peek at it */
     v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
     v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
 
-    ler_enable();
     pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
-    return;
-
- out:
-    ler_enable();
-    return;
 }
 
 static void __init noinline __set_intr_gate(unsigned int n,
@@ -1952,38 +1940,46 @@ void load_TR(void)
         : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
 }
 
-void percpu_traps_init(void)
+static unsigned int calc_ler_msr(void)
 {
-    subarch_percpu_traps_init();
-
-    if ( !opt_ler )
-        return;
-
     switch ( boot_cpu_data.x86_vendor )
     {
     case X86_VENDOR_INTEL:
         switch ( boot_cpu_data.x86 )
         {
         case 6:
-            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
-            break;
+            return MSR_IA32_LASTINTFROMIP;
+
         case 15:
-            this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
-            break;
+            return MSR_P4_LER_FROM_LIP;
         }
         break;
+
     case X86_VENDOR_AMD:
         switch ( boot_cpu_data.x86 )
         {
         case 6:
         case 0xf ... 0x17:
-            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
-            break;
+            return MSR_IA32_LASTINTFROMIP;
         }
         break;
     }
 
-    ler_enable();
+    return 0;
+}
+
+void percpu_traps_init(void)
+{
+    subarch_percpu_traps_init();
+
+    if ( !opt_ler )
+        return;
+
+    if ( !ler_msr && (ler_msr = calc_ler_msr()) )
+        setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
+
+    if ( cpu_has_xen_lbr )
+        wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
 }
 
 void __init init_idt_traps(void)
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index f7f6928d70..b0401850ef 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs)
     printk("CPU:    %d\n", smp_processor_id());
     _show_registers(&fault_regs, fault_crs, context, v);
 
-    if ( this_cpu(ler_msr) && !guest_mode(regs) )
+    if ( ler_msr && !guest_mode(regs) )
     {
         u64 from, to;
-        rdmsrl(this_cpu(ler_msr), from);
-        rdmsrl(this_cpu(ler_msr) + 1, to);
+
+        rdmsrl(ler_msr, from);
+        rdmsrl(ler_msr + 1, to);
         printk("ler: %016lx -> %016lx\n", from, to);
     }
 }
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 2cf8f7ea2a..b237da165c 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -113,6 +113,7 @@
 #define cpu_has_aperfmperf      boot_cpu_has(X86_FEATURE_APERFMPERF)
 #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
 #define cpu_has_no_xpti         boot_cpu_has(X86_FEATURE_NO_XPTI)
+#define cpu_has_xen_lbr         boot_cpu_has(X86_FEATURE_XEN_LBR)
 
 enum _cache_type {
     CACHE_TYPE_NULL = 0,
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index b90aa2d046..8e5cc53dde 100644
--- a/xen/include/asm-x86/cpufeatures.h
+++ b/xen/include/asm-x86/cpufeatures.h
@@ -32,3 +32,4 @@ XEN_CPUFEATURE(SC_RSB_PV,       (FSCAPINTS+0)*32+18) /* RSB overwrite needed for
 XEN_CPUFEATURE(SC_RSB_HVM,      (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */
 XEN_CPUFEATURE(NO_XPTI,         (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */
 XEN_CPUFEATURE(SC_MSR_IDLE,     (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
+XEN_CPUFEATURE(XEN_LBR,         (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index f14f265aa5..afbeb7f155 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -241,7 +241,7 @@ static inline void write_efer(uint64_t val)
     wrmsrl(MSR_EFER, val);
 }
 
-DECLARE_PER_CPU(u32, ler_msr);
+extern unsigned int ler_msr;
 
 DECLARE_PER_CPU(uint32_t, tsc_aux);
 
-- 
2.17.1


From 61cc8769a917c646b9bc99ee8adbea602f8d50d2 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 28 May 2018 15:02:34 +0100
Subject: [PATCH 21/42] x86/vmx: Defer vmx_vmcs_exit() as long as possible in
 construct_vmcs()

paging_update_paging_modes() and vmx_vlapic_msr_changed() both operate on the
VMCS being constructed.  Avoid dropping and re-acquiring the reference
multiple times.
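
The resulting structure, sketched with stand-in helpers: one reference held
across everything that touches the VMCS, dropped at a single exit point.

    struct vcpu;

    static void vmx_vmcs_enter_stub(struct vcpu *v) { (void)v; }
    static void vmx_vmcs_exit_stub(struct vcpu *v) { (void)v; }
    static int setup_step(struct vcpu *v) { (void)v; return 0; }

    static int construct_vmcs_sketch(struct vcpu *v)
    {
        int rc = 0;

        vmx_vmcs_enter_stub(v);

        rc = setup_step(v);        /* e.g. msr_bitmap allocation */
        if ( rc )
            goto out;

        /* paging/vlapic updates also happen under the same reference */

     out:
        vmx_vmcs_exit_stub(v);

        return rc;
    }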

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit f30e3cf34042846e391e3f8361fc6a76d181a7ee)
---
 xen/arch/x86/hvm/vmx/vmcs.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 258fc08f72..15d63663e5 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -996,6 +996,7 @@ static int construct_vmcs(struct vcpu *v)
     struct domain *d = v->domain;
     u32 vmexit_ctl = vmx_vmexit_control;
     u32 vmentry_ctl = vmx_vmentry_control;
+    int rc = 0;
 
     vmx_vmcs_enter(v);
 
@@ -1083,8 +1084,8 @@ static int construct_vmcs(struct vcpu *v)
 
         if ( msr_bitmap == NULL )
         {
-            vmx_vmcs_exit(v);
-            return -ENOMEM;
+            rc = -ENOMEM;
+            goto out;
         }
 
         memset(msr_bitmap, ~0, PAGE_SIZE);
@@ -1268,14 +1269,15 @@ static int construct_vmcs(struct vcpu *v)
     if ( cpu_has_vmx_tsc_scaling )
         __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
 
-    vmx_vmcs_exit(v);
-
     /* will update HOST & GUEST_CR3 as reqd */
     paging_update_paging_modes(v);
 
     vmx_vlapic_msr_changed(v);
 
-    return 0;
+ out:
+    vmx_vmcs_exit(v);
+
+    return rc;
 }
 
 static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
-- 
2.17.1


From 935e9c404714f5fa6d31890034a7e2cc11c6e0b9 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 22/42] x86/vmx: API improvements for MSR load/save
 infrastructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collect together related infrastructure in vmcs.h, rather than having it
spread out.  Turn vmx_{read,write}_guest_msr() into static inlines, as they
are simple enough.

Replace 'int type' with 'enum vmx_msr_list_type', and use switch statements
internally.  Later changes are going to introduce a new type.

Rename the type identifiers for consistency with the other VMX_MSR_*
constants.

No functional change.
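
A sketch of why the enum is preferable to a plain int: switch statements over
an enum get compiler coverage warnings when a later change adds a new list
type, which is exactly the situation this message anticipates.  Names below
mirror the patch; assert() stands in for ASSERT_UNREACHABLE().

    #include <assert.h>
    #include <stddef.h>

    enum vmx_msr_list_type {
        VMX_MSR_HOST,    /* MSRs loaded on VMExit */
        VMX_MSR_GUEST,   /* MSRs saved on VMExit, loaded on VMEntry */
    };

    static const char *describe(enum vmx_msr_list_type type)
    {
        switch ( type )
        {
        case VMX_MSR_HOST:  return "host";
        case VMX_MSR_GUEST: return "guest";
        }

        assert(!"unreachable");
        return NULL;
    }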

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit f54b63e8617ada823be43d60467a43c8224b7909)
---
 xen/arch/x86/hvm/vmx/vmcs.c        | 93 +++++++++++++-----------------
 xen/arch/x86/hvm/vmx/vmx.c         |  8 +--
 xen/include/asm-x86/hvm/vmx/vmcs.h | 62 +++++++++++++++-----
 3 files changed, 91 insertions(+), 72 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 15d63663e5..6bc6597242 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1293,22 +1293,26 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
     return 0;
 }
 
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
+struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
 {
     struct vcpu *curr = current;
     unsigned int msr_count;
-    struct vmx_msr_entry *msr_area;
+    struct vmx_msr_entry *msr_area = NULL;
 
-    if ( type == VMX_GUEST_MSR )
+    switch ( type )
     {
-        msr_count = curr->arch.hvm_vmx.msr_count;
-        msr_area = curr->arch.hvm_vmx.msr_area;
-    }
-    else
-    {
-        ASSERT(type == VMX_HOST_MSR);
+    case VMX_MSR_HOST:
         msr_count = curr->arch.hvm_vmx.host_msr_count;
         msr_area = curr->arch.hvm_vmx.host_msr_area;
+        break;
+
+    case VMX_MSR_GUEST:
+        msr_count = curr->arch.hvm_vmx.msr_count;
+        msr_area = curr->arch.hvm_vmx.msr_area;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
     }
 
     if ( msr_area == NULL )
@@ -1318,48 +1322,27 @@ struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
                    vmx_msr_entry_key_cmp);
 }
 
-int vmx_read_guest_msr(u32 msr, u64 *val)
-{
-    struct vmx_msr_entry *ent;
-
-    if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
-    {
-        *val = ent->data;
-        return 0;
-    }
-
-    return -ESRCH;
-}
-
-int vmx_write_guest_msr(u32 msr, u64 val)
-{
-    struct vmx_msr_entry *ent;
-
-    if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
-    {
-        ent->data = val;
-        return 0;
-    }
-
-    return -ESRCH;
-}
-
-int vmx_add_msr(u32 msr, int type)
+int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
 {
     struct vcpu *curr = current;
     unsigned int idx, *msr_count;
     struct vmx_msr_entry **msr_area, *msr_area_elem;
 
-    if ( type == VMX_GUEST_MSR )
+    switch ( type )
     {
-        msr_count = &curr->arch.hvm_vmx.msr_count;
-        msr_area = &curr->arch.hvm_vmx.msr_area;
-    }
-    else
-    {
-        ASSERT(type == VMX_HOST_MSR);
+    case VMX_MSR_HOST:
         msr_count = &curr->arch.hvm_vmx.host_msr_count;
         msr_area = &curr->arch.hvm_vmx.host_msr_area;
+        break;
+
+    case VMX_MSR_GUEST:
+        msr_count = &curr->arch.hvm_vmx.msr_count;
+        msr_area = &curr->arch.hvm_vmx.msr_area;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return -EINVAL;
     }
 
     if ( *msr_area == NULL )
@@ -1367,13 +1350,17 @@ int vmx_add_msr(u32 msr, int type)
         if ( (*msr_area = alloc_xenheap_page()) == NULL )
             return -ENOMEM;
 
-        if ( type == VMX_GUEST_MSR )
+        switch ( type )
         {
+        case VMX_MSR_HOST:
+            __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+            break;
+
+        case VMX_MSR_GUEST:
             __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
             __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+            break;
         }
-        else
-            __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
     }
 
     for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
@@ -1392,16 +1379,18 @@ int vmx_add_msr(u32 msr, int type)
 
     ++*msr_count;
 
-    if ( type == VMX_GUEST_MSR )
+    switch ( type )
     {
+    case VMX_MSR_HOST:
+        rdmsrl(msr, msr_area_elem->data);
+        __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
+        break;
+
+    case VMX_MSR_GUEST:
         msr_area_elem->data = 0;
         __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
         __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
-    }
-    else
-    {
-        rdmsrl(msr, msr_area_elem->data);
-        __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
+        break;
     }
 
     return 0;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index bb164359bb..d4ebae8945 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void)
     struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
     struct vmx_msr_entry *msr;
 
-    if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
+    if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL )
     {
         /*
          * Sign extend into bits 61:62 while preserving bit 63
@@ -4179,7 +4179,7 @@ static void lbr_tsx_fixup(void)
             msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
     }
 
-    if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
+    if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
         msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
 }
 
@@ -4207,8 +4207,8 @@ static void bdw_erratum_bdf14_fixup(void)
      * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
      * sign-extending into bits 48:63.
      */
-    sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
-    sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
+    sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
+    sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
 }
 
 static void lbr_fixup(void)
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 06c3179cec..20882d13e0 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -514,9 +514,6 @@ enum vmcs_field {
 
 #define VMCS_VPID_WIDTH 16
 
-#define VMX_GUEST_MSR 0
-#define VMX_HOST_MSR  1
-
 /* VM Instruction error numbers */
 enum vmx_insn_errno
 {
@@ -534,6 +531,52 @@ enum vmx_insn_errno
     VMX_INSN_FAIL_INVALID                  = ~0,
 };
 
+/* MSR load/save list infrastructure. */
+enum vmx_msr_list_type {
+    VMX_MSR_HOST,           /* MSRs loaded on VMExit.                   */
+    VMX_MSR_GUEST,          /* MSRs saved on VMExit, loaded on VMEntry. */
+};
+
+int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type);
+
+static inline int vmx_add_host_load_msr(uint32_t msr)
+{
+    return vmx_add_msr(msr, VMX_MSR_HOST);
+}
+
+static inline int vmx_add_guest_msr(uint32_t msr)
+{
+    return vmx_add_msr(msr, VMX_MSR_GUEST);
+}
+
+struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type);
+
+static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
+{
+    const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
+
+    if ( !ent )
+        return -ESRCH;
+
+    *val = ent->data;
+
+    return 0;
+}
+
+static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val)
+{
+    struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
+
+    if ( !ent )
+        return -ESRCH;
+
+    ent->data = val;
+
+    return 0;
+}
+
+
+/* MSR intercept bitmap infrastructure. */
 enum vmx_msr_intercept_type {
     VMX_MSR_R  = 1,
     VMX_MSR_W  = 2,
@@ -544,10 +587,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
                              enum vmx_msr_intercept_type type);
 void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
                            enum vmx_msr_intercept_type type);
-int vmx_read_guest_msr(u32 msr, u64 *val);
-int vmx_write_guest_msr(u32 msr, u64 val);
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type);
-int vmx_add_msr(u32 msr, int type);
 void vmx_vmcs_switch(paddr_t from, paddr_t to);
 void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
 void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
@@ -562,15 +601,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val);
 enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
                                               u32 vmcs_encoding, u64 val);
 
-static inline int vmx_add_guest_msr(u32 msr)
-{
-    return vmx_add_msr(msr, VMX_GUEST_MSR);
-}
-static inline int vmx_add_host_load_msr(u32 msr)
-{
-    return vmx_add_msr(msr, VMX_HOST_MSR);
-}
-
 DECLARE_PER_CPU(bool_t, vmxon);
 
 bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
-- 
2.17.1


From 52b8f9ae22a5daa1f2cad0aa5065b72b48c33ce4 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 23/42] x86/vmx: Internal cleanup for MSR load/save
 infrastructure

 * Use an arch_vmx_struct local variable to reduce later code volume.
 * Use start/total instead of msr_area/msr_count.  This is in preparation for
   more fine-grained handling with later changes.
 * Use ent/end pointers (again for preparation), and to make the vmx_add_msr()
   logic easier to follow.
 * Make the memory allocation block of vmx_add_msr() unlikely, and calculate
   virt_to_maddr() just once.

No practical change to functionality.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit 94fda356fcdcc847662a4c9f6cc63511f25c1247)
---
 xen/arch/x86/hvm/vmx/vmcs.c | 75 ++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 6bc6597242..a6ddba3132 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1296,48 +1296,49 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
 struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
 {
     struct vcpu *curr = current;
-    unsigned int msr_count;
-    struct vmx_msr_entry *msr_area = NULL;
+    struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
+    struct vmx_msr_entry *start = NULL;
+    unsigned int total;
 
     switch ( type )
     {
     case VMX_MSR_HOST:
-        msr_count = curr->arch.hvm_vmx.host_msr_count;
-        msr_area = curr->arch.hvm_vmx.host_msr_area;
+        start    = vmx->host_msr_area;
+        total    = vmx->host_msr_count;
         break;
 
     case VMX_MSR_GUEST:
-        msr_count = curr->arch.hvm_vmx.msr_count;
-        msr_area = curr->arch.hvm_vmx.msr_area;
+        start    = vmx->msr_area;
+        total    = vmx->msr_count;
         break;
 
     default:
         ASSERT_UNREACHABLE();
     }
 
-    if ( msr_area == NULL )
+    if ( !start )
         return NULL;
 
-    return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
-                   vmx_msr_entry_key_cmp);
+    return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp);
 }
 
 int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
 {
     struct vcpu *curr = current;
-    unsigned int idx, *msr_count;
-    struct vmx_msr_entry **msr_area, *msr_area_elem;
+    struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
+    struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
+    unsigned int total;
 
     switch ( type )
     {
     case VMX_MSR_HOST:
-        msr_count = &curr->arch.hvm_vmx.host_msr_count;
-        msr_area = &curr->arch.hvm_vmx.host_msr_area;
+        ptr      = &vmx->host_msr_area;
+        total    = vmx->host_msr_count;
         break;
 
     case VMX_MSR_GUEST:
-        msr_count = &curr->arch.hvm_vmx.msr_count;
-        msr_area = &curr->arch.hvm_vmx.msr_area;
+        ptr      = &vmx->msr_area;
+        total    = vmx->msr_count;
         break;
 
     default:
@@ -1345,51 +1346,55 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
         return -EINVAL;
     }
 
-    if ( *msr_area == NULL )
+    /* Allocate memory on first use. */
+    if ( unlikely(!*ptr) )
     {
-        if ( (*msr_area = alloc_xenheap_page()) == NULL )
+        paddr_t addr;
+
+        if ( (*ptr = alloc_xenheap_page()) == NULL )
             return -ENOMEM;
 
+        addr = virt_to_maddr(*ptr);
+
         switch ( type )
         {
         case VMX_MSR_HOST:
-            __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+            __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr);
             break;
 
         case VMX_MSR_GUEST:
-            __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
-            __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+            __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
+            __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
             break;
         }
     }
 
-    for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
-        if ( (*msr_area)[idx].index == msr )
+    start = *ptr;
+    end   = start + total;
+
+    for ( ent = start; ent < end && ent->index <= msr; ++ent )
+        if ( ent->index == msr )
             return 0;
 
-    if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
+    if ( total == (PAGE_SIZE / sizeof(*ent)) )
         return -ENOSPC;
 
-    memmove(*msr_area + idx + 1, *msr_area + idx,
-            sizeof(*msr_area_elem) * (*msr_count - idx));
-
-    msr_area_elem = *msr_area + idx;
-    msr_area_elem->index = msr;
-    msr_area_elem->mbz = 0;
+    memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
 
-    ++*msr_count;
+    ent->index = msr;
+    ent->mbz = 0;
 
     switch ( type )
     {
     case VMX_MSR_HOST:
-        rdmsrl(msr, msr_area_elem->data);
-        __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
+        rdmsrl(msr, ent->data);
+        __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
         break;
 
     case VMX_MSR_GUEST:
-        msr_area_elem->data = 0;
-        __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
-        __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
+        ent->data = 0;
+        __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
+        __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
         break;
     }
 
-- 
2.17.1


From b52017c904ae770ab86a62bf3219ee21d23bb55b Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 24/42] x86/vmx: Factor locate_msr_entry() out of
 vmx_find_msr() and vmx_add_msr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of having multiple algorithms searching the MSR lists, implement a
single one.  It has the semantics required by vmx_add_msr(): identifying the
position in which an MSR should live, if it isn't already present.

There will be a marginal improvement for vmx_find_msr() by avoiding the
function pointer calls to vmx_msr_entry_key_cmp(), and a major improvement for
vmx_add_msr() by using a binary search instead of a linear search.
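
The search itself, reproduced as a self-contained sketch (a textbook
lower-bound binary search; the struct is a stand-in for vmx_msr_entry):

    #include <stdint.h>

    struct entry { uint32_t index; };

    /* Returns the matching entry, or the slot where one should be
     * inserted to keep the array sorted.  The result may equal 'end',
     * so it must be bounds-checked before dereferencing. */
    static struct entry *locate(struct entry *start, struct entry *end,
                                uint32_t key)
    {
        while ( start < end )
        {
            struct entry *mid = start + (end - start) / 2;

            if ( key < mid->index )
                end = mid;
            else if ( key > mid->index )
                start = mid + 1;
            else
                return mid;
        }

        return start;
    }

A caller distinguishes "found" from "insertion point" with
`(ent < end) && (ent->index == key)`, which is the test both call sites use
after this change.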

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit 4d94828cf11104256dccea1fa7762f00575dfaa0)
---
 xen/arch/x86/hvm/vmx/vmcs.c | 41 +++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index a6ddba3132..c75b0ee5c3 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1280,24 +1280,36 @@ static int construct_vmcs(struct vcpu *v)
     return rc;
 }
 
-static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
+/*
+ * Search an MSR list looking for an MSR entry, or the slot in which it should
+ * live (to keep the data sorted) if an entry is not found.
+ *
+ * The return pointer is guaranteed to be bounded by start and end.  However,
+ * it may point at end, and may be invalid for the caller to dereference.
+ */
+static struct vmx_msr_entry *locate_msr_entry(
+    struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr)
 {
-    const u32 *msr = key;
-    const struct vmx_msr_entry *entry = elt;
+    while ( start < end )
+    {
+        struct vmx_msr_entry *mid = start + (end - start) / 2;
 
-    if ( *msr > entry->index )
-        return 1;
-    if ( *msr < entry->index )
-        return -1;
+        if ( msr < mid->index )
+            end = mid;
+        else if ( msr > mid->index )
+            start = mid + 1;
+        else
+            return mid;
+    }
 
-    return 0;
+    return start;
 }
 
 struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
 {
     struct vcpu *curr = current;
     struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
-    struct vmx_msr_entry *start = NULL;
+    struct vmx_msr_entry *start = NULL, *ent, *end;
     unsigned int total;
 
     switch ( type )
@@ -1319,7 +1331,10 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
     if ( !start )
         return NULL;
 
-    return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp);
+    end = start + total;
+    ent = locate_msr_entry(start, end, msr);
+
+    return ((ent < end) && (ent->index == msr)) ? ent : NULL;
 }
 
 int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
@@ -1371,10 +1386,10 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
 
     start = *ptr;
     end   = start + total;
+    ent   = locate_msr_entry(start, end, msr);
 
-    for ( ent = start; ent < end && ent->index <= msr; ++ent )
-        if ( ent->index == msr )
-            return 0;
+    if ( (ent < end) && (ent->index == msr) )
+        return 0;
 
     if ( total == (PAGE_SIZE / sizeof(*ent)) )
         return -ENOSPC;
-- 
2.17.1


From 218d403ad944f47548752d4a60e8f77e5f8e1950 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 25/42] x86/vmx: Support remote access to the MSR lists

At the moment, all modifications of the MSR lists are in current context.
However, future changes may need to put MSR_EFER into the lists from domctl
hypercall context.

Plumb a struct vcpu parameter down through the infrastructure, and use
vmx_vmcs_{enter,exit}() for safe access to the VMCS in vmx_add_msr().  Use
assertions to ensure that access is either in current context, or while the
vcpu is paused.

Note these expectations beside the fields in arch_vmx_struct, and reorder the
fields to avoid unnecessary padding.
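
A sketch of the safety condition (the names below are stand-ins for Xen's
`current` and vcpu pause machinery): remote access is only legitimate when
the target vcpu cannot be running underneath us.

    #include <assert.h>
    #include <stdbool.h>

    struct vcpu { bool paused; };

    static struct vcpu *current;   /* stand-in for Xen's 'current' */

    static bool vcpu_runnable(const struct vcpu *v)
    {
        return !v->paused;
    }

    static void assert_msr_list_access_ok(const struct vcpu *v)
    {
        assert(v == current || !vcpu_runnable(v));
    }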

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 80599f0b770199116aa753bfdfac9bfe2e8ea86a)
---
 xen/arch/x86/cpu/vpmu_intel.c      | 14 +++++------
 xen/arch/x86/hvm/vmx/vmcs.c        | 40 ++++++++++++++++++++++--------
 xen/arch/x86/hvm/vmx/vmx.c         | 22 ++++++++--------
 xen/include/asm-x86/hvm/vmx/vmcs.h | 34 ++++++++++++++++---------
 xen/include/xen/sched.h            |  2 +-
 5 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
index 207e2e712c..c499e69f2f 100644
--- a/xen/arch/x86/cpu/vpmu_intel.c
+++ b/xen/arch/x86/cpu/vpmu_intel.c
@@ -455,12 +455,12 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
     if ( is_hvm_vcpu(v) )
     {
         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-        if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
             goto out_err;
 
-        if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
             goto out_err;
-        vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+        vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);
     }
 
     core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
@@ -613,7 +613,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
             return -EINVAL;
 
         if ( is_hvm_vcpu(v) )
-            vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+            vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
                                &core2_vpmu_cxt->global_ctrl);
         else
             rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
@@ -682,7 +682,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
                 return -EINVAL;
 
             if ( is_hvm_vcpu(v) )
-                vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+                vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
                                    &core2_vpmu_cxt->global_ctrl);
             else
                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
@@ -701,7 +701,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
     else
     {
         if ( is_hvm_vcpu(v) )
-            vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+            vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
         else
             wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
     }
@@ -735,7 +735,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
             break;
         case MSR_CORE_PERF_GLOBAL_CTRL:
             if ( is_hvm_vcpu(v) )
-                vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+                vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
             else
                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
             break;
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index c75b0ee5c3..e86f292fbc 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1305,13 +1305,15 @@ static struct vmx_msr_entry *locate_msr_entry(
     return start;
 }
 
-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+                                   enum vmx_msr_list_type type)
 {
-    struct vcpu *curr = current;
-    struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
+    const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry *start = NULL, *ent, *end;
     unsigned int total;
 
+    ASSERT(v == current || !vcpu_runnable(v));
+
     switch ( type )
     {
     case VMX_MSR_HOST:
@@ -1337,12 +1339,14 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
     return ((ent < end) && (ent->index == msr)) ? ent : NULL;
 }
 
-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
 {
-    struct vcpu *curr = current;
-    struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
+    struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
     unsigned int total;
+    int rc;
+
+    ASSERT(v == current || !vcpu_runnable(v));
 
     switch ( type )
     {
@@ -1361,13 +1365,18 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
         return -EINVAL;
     }
 
+    vmx_vmcs_enter(v);
+
     /* Allocate memory on first use. */
     if ( unlikely(!*ptr) )
     {
         paddr_t addr;
 
         if ( (*ptr = alloc_xenheap_page()) == NULL )
-            return -ENOMEM;
+        {
+            rc = -ENOMEM;
+            goto out;
+        }
 
         addr = virt_to_maddr(*ptr);
 
@@ -1389,10 +1398,16 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
     ent   = locate_msr_entry(start, end, msr);
 
     if ( (ent < end) && (ent->index == msr) )
-        return 0;
+    {
+        rc = 0;
+        goto out;
+    }
 
     if ( total == (PAGE_SIZE / sizeof(*ent)) )
-        return -ENOSPC;
+    {
+        rc = -ENOSPC;
+        goto out;
+    }
 
     memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
 
@@ -1413,7 +1428,12 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
         break;
     }
 
-    return 0;
+    rc = 0;
+
+ out:
+    vmx_vmcs_exit(v);
+
+    return rc;
 }
 
 void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index d4ebae8945..95162bf187 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2822,7 +2822,7 @@ static int is_last_branch_msr(u32 ecx)
 
 static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
-    const struct vcpu *curr = current;
+    struct vcpu *curr = current;
 
     HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
 
@@ -2901,7 +2901,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
         if ( passive_domain_do_rdmsr(msr, msr_content) )
             goto done;
 
-        if ( vmx_read_guest_msr(msr, msr_content) == 0 )
+        if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 )
             break;
 
         if ( is_last_branch_msr(msr) )
@@ -3113,7 +3113,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 
             for ( ; (rc == 0) && lbr->count; lbr++ )
                 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
-                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
+                    if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
                     {
                         vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
                         if ( lbr_tsx_fixup_needed )
@@ -3153,7 +3153,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
         if ( wrmsr_viridian_regs(msr, msr_content) ) 
             break;
 
-        if ( vmx_write_guest_msr(msr, msr_content) == 0 ||
+        if ( vmx_write_guest_msr(v, msr, msr_content) == 0 ||
              is_last_branch_msr(msr) )
             break;
 
@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void)
     struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
     struct vmx_msr_entry *msr;
 
-    if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL )
+    if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL )
     {
         /*
          * Sign extend into bits 61:62 while preserving bit 63
@@ -4179,15 +4179,15 @@ static void lbr_tsx_fixup(void)
             msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
     }
 
-    if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
+    if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
         msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
 }
 
-static void sign_extend_msr(u32 msr, int type)
+static void sign_extend_msr(struct vcpu *v, u32 msr, int type)
 {
     struct vmx_msr_entry *entry;
 
-    if ( (entry = vmx_find_msr(msr, type)) != NULL )
+    if ( (entry = vmx_find_msr(v, msr, type)) != NULL )
     {
         if ( entry->data & VADDR_TOP_BIT )
             entry->data |= CANONICAL_MASK;
@@ -4198,6 +4198,8 @@ static void sign_extend_msr(u32 msr, int type)
 
 static void bdw_erratum_bdf14_fixup(void)
 {
+    struct vcpu *curr = current;
+
     /*
      * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
      * been observed to have the top three bits corrupted as though the
@@ -4207,8 +4209,8 @@ static void bdw_erratum_bdf14_fixup(void)
      * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
      * sign-extending into bits 48:63.
      */
-    sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
-    sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
+    sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
+    sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
 }
 
 static void lbr_fixup(void)
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 20882d13e0..62afebec11 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -130,10 +130,17 @@ struct arch_vmx_struct {
     uint64_t             sfmask;
 
     struct vmx_msr_bitmap *msr_bitmap;
-    unsigned int         msr_count;
+
+    /*
+     * Most accesses to the MSR host/guest load/save lists are in current
+     * context.  However, the data can be modified by toolstack/migration
+     * actions.  Remote access is only permitted for paused vcpus, and is
+     * protected under the domctl lock.
+     */
     struct vmx_msr_entry *msr_area;
-    unsigned int         host_msr_count;
     struct vmx_msr_entry *host_msr_area;
+    unsigned int         msr_count;
+    unsigned int         host_msr_count;
 
     unsigned long        eoi_exitmap_changed;
     DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS);
@@ -537,23 +544,25 @@ enum vmx_msr_list_type {
     VMX_MSR_GUEST,          /* MSRs saved on VMExit, loaded on VMEntry. */
 };
 
-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type);
+int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type);
 
-static inline int vmx_add_host_load_msr(uint32_t msr)
+static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr)
 {
-    return vmx_add_msr(msr, VMX_MSR_HOST);
+    return vmx_add_msr(v, msr, VMX_MSR_GUEST);
 }
 
-static inline int vmx_add_guest_msr(uint32_t msr)
+static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr)
 {
-    return vmx_add_msr(msr, VMX_MSR_GUEST);
+    return vmx_add_msr(v, msr, VMX_MSR_HOST);
 }
 
-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type);
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+                                   enum vmx_msr_list_type type);
 
-static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
+static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr,
+                                     uint64_t *val)
 {
-    const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
+    const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
 
     if ( !ent )
         return -ESRCH;
@@ -563,9 +572,10 @@ static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
     return 0;
 }
 
-static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val)
+static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
+                                      uint64_t val)
 {
-    struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
+    struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
 
     if ( !ent )
         return -ESRCH;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 99d2af2e1f..e79d5a36ca 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -788,7 +788,7 @@ static inline struct domain *next_domain_in_cpupool(
 #define _VPF_parked          8
 #define VPF_parked           (1UL<<_VPF_parked)
 
-static inline int vcpu_runnable(struct vcpu *v)
+static inline bool vcpu_runnable(const struct vcpu *v)
 {
     return !(v->pause_flags |
              atomic_read(&v->pause_count) |
-- 
2.17.1


From cfdd4e846a77ca5510b6c35adeec55014a73efb9 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 26/42] x86/vmx: Improvements to LBR MSR handling

The main purpose of this patch is to only ever insert the LBR MSRs into the
guest load/save list once, as a future patch wants to change the behaviour of
vmx_add_guest_msr().

The repeated processing of lbr_info and the guest's MSR load/save list is
redundant, and a guest using LBR itself will have to re-enable
MSR_DEBUGCTL.LBR in its #DB handler, meaning that Xen will repeat this
redundant processing every time the guest gets a debug exception.

Rename lbr_fixup_enabled to lbr_flags to be a little more generic, and use one
bit to indicate that the MSRs have been inserted into the load/save list.
Shorten the existing FIXUP* identifiers to reduce code volume.

Furthermore, handing the guest a #MC on an error isn't a legitimate action.  Two
of the three failure cases are definitely hypervisor bugs, and the third is a
boundary case which shouldn't occur in practice.  The guest also won't execute
correctly, so handle errors by cleanly crashing the guest.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit be73a842e642772d7372004c9c105de35b771020)
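
For illustration only (not part of the patch), a minimal standalone C
sketch of the insert-once pattern this patch adopts; lbr_flags, add_msr()
and the crash path here are stand-ins for the real vmx fields and helpers:

    #include <stdio.h>

    #define LBR_MSRS_INSERTED  (1u << 0)

    static unsigned int lbr_flags;

    /* Stand-in for vmx_add_guest_msr(); returns 0 on success. */
    static int add_msr(unsigned int msr) { (void)msr; return 0; }

    static void enable_lbr(unsigned int base, unsigned int count)
    {
        unsigned int i;

        if ( lbr_flags & LBR_MSRS_INSERTED )
            return;                      /* Already inserted: skip the rework. */

        for ( i = 0; i < count; i++ )
            if ( add_msr(base + i) )
            {
                puts("load/save list error: crash the domain");
                return;
            }

        lbr_flags |= LBR_MSRS_INSERTED;  /* Never repeated afterwards. */
    }

    int main(void)
    {
        enable_lbr(0x680, 16);  /* First enable does the insertion. */
        enable_lbr(0x680, 16);  /* Re-enables from #DB handlers are no-ops. */
        return 0;
    }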
---
 xen/arch/x86/hvm/vmx/vmx.c         | 81 +++++++++++++++++++++---------
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 +-
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 95162bf187..5f01652d48 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2758,8 +2758,10 @@ enum
 
 #define LBR_FROM_SIGNEXT_2MSB  ((1ULL << 59) | (1ULL << 60))
 
-#define FIXUP_LBR_TSX            (1u << 0)
-#define FIXUP_BDW_ERRATUM_BDF14  (1u << 1)
+#define LBR_MSRS_INSERTED      (1u << 0)
+#define LBR_FIXUP_TSX          (1u << 1)
+#define LBR_FIXUP_BDF14        (1u << 2)
+#define LBR_FIXUP_MASK         (LBR_FIXUP_TSX | LBR_FIXUP_BDF14)
 
 static bool __read_mostly lbr_tsx_fixup_needed;
 static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
@@ -3094,7 +3096,6 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
         break;
 
     case MSR_IA32_DEBUGCTLMSR: {
-        int i, rc = 0;
         uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
 
         if ( boot_cpu_has(X86_FEATURE_RTM) )
@@ -3105,30 +3106,64 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
             if ( vpmu_do_wrmsr(msr, msr_content, supported) )
                 break;
         }
-        if ( msr_content & IA32_DEBUGCTLMSR_LBR )
+
+        /*
+         * When a guest first enables LBR, arrange to save and restore the LBR
+         * MSRs and allow the guest direct access.
+         *
+         * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have
+         * existed, and there is no architectural way to hide the feature, or
+         * fail the attempt to enable LBR.
+         *
+         * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save
+         * list are definitely hypervisor bugs, whereas -ENOMEM for allocating
+         * the load/save list is simply unlucky (and shouldn't occur with
+         * sensible management by the toolstack).
+         *
+         * Either way, there is nothing we can do right now to recover, and
+         * the guest won't execute correctly either.  Simply crash the domain
+         * to make the failure obvious.
+         */
+        if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
+             (msr_content & IA32_DEBUGCTLMSR_LBR) )
         {
             const struct lbr_info *lbr = last_branch_msr_get();
-            if ( lbr == NULL )
-                break;
 
-            for ( ; (rc == 0) && lbr->count; lbr++ )
-                for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
-                    if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
+            if ( unlikely(!lbr) )
+            {
+                gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n");
+                domain_crash(v->domain);
+                return X86EMUL_OKAY;
+            }
+
+            for ( ; lbr->count; lbr++ )
+            {
+                unsigned int i;
+
+                for ( i = 0; i < lbr->count; i++ )
+                {
+                    int rc = vmx_add_guest_msr(v, lbr->base + i);
+
+                    if ( unlikely(rc) )
                     {
-                        vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
-                        if ( lbr_tsx_fixup_needed )
-                            v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
-                        if ( bdw_erratum_bdf14_fixup_needed )
-                            v->arch.hvm_vmx.lbr_fixup_enabled |=
-                                FIXUP_BDW_ERRATUM_BDF14;
+                        gprintk(XENLOG_ERR,
+                                "Guest load/save list error %d\n", rc);
+                        domain_crash(v->domain);
+                        return X86EMUL_OKAY;
                     }
-        }
 
-        if ( rc < 0 )
-            hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
-        else
-            __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+                    vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
+                }
+            }
+
+            v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
+            if ( lbr_tsx_fixup_needed )
+                v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX;
+            if ( bdw_erratum_bdf14_fixup_needed )
+                v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14;
+        }
 
+        __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
         break;
     }
     case MSR_IA32_FEATURE_CONTROL:
@@ -4217,9 +4252,9 @@ static void lbr_fixup(void)
 {
     struct vcpu *curr = current;
 
-    if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
+    if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX )
         lbr_tsx_fixup();
-    if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
+    if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 )
         bdw_erratum_bdf14_fixup();
 }
 
@@ -4287,7 +4322,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
     }
 
  out:
-    if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
+    if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) )
         lbr_fixup();
 
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 62afebec11..2c9e291bee 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -156,7 +156,7 @@ struct arch_vmx_struct {
     /* Are we emulating rather than VMENTERing? */
     uint8_t              vmx_emulate;
 
-    uint8_t              lbr_fixup_enabled;
+    uint8_t              lbr_flags;
 
     /* Bitmask of segments that we can't safely use in virtual 8086 mode */
     uint16_t             vm86_segment_mask;
-- 
2.17.1


From 8b35b978a273a153ceadccd9c02d433f8be1c9bd Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 27/42] x86/vmx: Pass an MSR value into vmx_add_msr()

The main purpose of this change is to allow us to set a specific MSR value,
without needing to know whether there is already a load/save list slot for it.

Previously, callers wanting this property needed to call both vmx_add_*_msr()
and vmx_write_*_msr() to cover both cases, and there are no callers which want
the old behaviour of being a no-op if an entry already existed for the MSR.

As a result of this API improvement, the default value for guest MSRs need not
be 0, and the default for host MSRs need not be read out of the hardware
register.
In practice, this cleans up the VPMU allocation logic, and avoids an MSR read
as part of vcpu construction.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit ee7689b94ac7094b975ab4a023cfeae209da0a36)
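
To illustrate the API change, here is the vpmu_intel.c call site condensed
from the hunk below (a restatement of the diff, not additional code):

    /* Before: establish the host value, insert, then write separately. */
    wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
    if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
        goto out_err;
    if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
        goto out_err;
    vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);

    /* After: the value travels with the insertion. */
    if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
        goto out_err;
    if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
        goto out_err;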
---
 xen/arch/x86/cpu/vpmu_intel.c      |  6 ++----
 xen/arch/x86/hvm/vmx/vmcs.c        | 14 +++++++-------
 xen/arch/x86/hvm/vmx/vmx.c         |  2 +-
 xen/include/asm-x86/hvm/vmx/vmcs.h | 20 ++++++++++++++------
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
index c499e69f2f..1fc79c9ff4 100644
--- a/xen/arch/x86/cpu/vpmu_intel.c
+++ b/xen/arch/x86/cpu/vpmu_intel.c
@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
 
     if ( is_hvm_vcpu(v) )
     {
-        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-        if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
+        if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
             goto out_err;
 
-        if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
+        if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
             goto out_err;
-        vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);
     }
 
     core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index e86f292fbc..af422b3f92 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1339,7 +1339,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
     return ((ent < end) && (ent->index == msr)) ? ent : NULL;
 }
 
-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+                enum vmx_msr_list_type type)
 {
     struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
@@ -1398,11 +1399,9 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
     ent   = locate_msr_entry(start, end, msr);
 
     if ( (ent < end) && (ent->index == msr) )
-    {
-        rc = 0;
-        goto out;
-    }
+        goto found;
 
+    /* If there isn't an existing entry for msr, insert room for one. */
     if ( total == (PAGE_SIZE / sizeof(*ent)) )
     {
         rc = -ENOSPC;
@@ -1417,17 +1416,18 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
     switch ( type )
     {
     case VMX_MSR_HOST:
-        rdmsrl(msr, ent->data);
         __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
         break;
 
     case VMX_MSR_GUEST:
-        ent->data = 0;
         __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
         __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
         break;
     }
 
+    /* Set the msr's value. */
+ found:
+    ent->data = val;
     rc = 0;
 
  out:
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 5f01652d48..5745543e49 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3142,7 +3142,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 
                 for ( i = 0; i < lbr->count; i++ )
                 {
-                    int rc = vmx_add_guest_msr(v, lbr->base + i);
+                    int rc = vmx_add_guest_msr(v, lbr->base + i, 0);
 
                     if ( unlikely(rc) )
                     {
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 2c9e291bee..f94a108ea5 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -544,16 +544,24 @@ enum vmx_msr_list_type {
     VMX_MSR_GUEST,          /* MSRs saved on VMExit, loaded on VMEntry. */
 };
 
-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type);
+/**
+ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
+ * set the MSRs value.
+ *
+ * May fail if unable to allocate memory for the list, or the total number of
+ * entries exceeds the memory allocated.
+ */
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+                enum vmx_msr_list_type type);
 
-static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr)
+static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val)
 {
-    return vmx_add_msr(v, msr, VMX_MSR_GUEST);
+    return vmx_add_msr(v, msr, val, VMX_MSR_GUEST);
 }
-
-static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr)
+static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr,
+                                        uint64_t val)
 {
-    return vmx_add_msr(v, msr, VMX_MSR_HOST);
+    return vmx_add_msr(v, msr, val, VMX_MSR_HOST);
 }
 
 struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
-- 
2.17.1


From 7b420e8a82cc8664e086ed31ec5e80615bd6225f Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 7 May 2018 11:57:00 +0100
Subject: [PATCH 28/42] x86/vmx: Support load-only guest MSR list entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the VMX_MSR_GUEST type maintains completely symmetric guest load
and save lists, by pointing VM_EXIT_MSR_STORE_ADDR and VM_ENTRY_MSR_LOAD_ADDR
at the same page, and setting VM_EXIT_MSR_STORE_COUNT and
VM_ENTRY_MSR_LOAD_COUNT to the same value.

However, for MSRs which we won't let the guest have direct access to, having
hardware save the current value on VMExit is unnecessary overhead.

To avoid this overhead, we must make the load and save lists asymmetric.  By
making the entry load count greater than the exit store count, we can maintain
two adjacent lists of MSRs, the first of which is saved and restored, and the
second of which is only restored on VMEntry.

For simplicity:
 * Both adjacent lists are still sorted by MSR index.
 * It is undefined behaviour to insert the same MSR into both lists.
 * The total size of both lists is still limited to 256 entries (one 4k page).

Split the current msr_count field into msr_{load,save}_count, and introduce a
new VMX_MSR_GUEST_LOADONLY type, and update vmx_{add,find}_msr() to calculate
which sublist to search, based on type.  VMX_MSR_HOST has no logical sublist,
whereas VMX_MSR_GUEST has a sublist between 0 and the save count, while
VMX_MSR_GUEST_LOADONLY has a sublist between the save count and the load
count.

One subtle point is that inserting an MSR into the load-save list involves
moving the entire load-only list, and updating both counts.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit 1ac46b55632626aeb935726e1b0a71605ef6763a)
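
As a standalone illustration of the sublist arithmetic (field and type
names loosely mirror the patch; the bounds logic is the point, not the
VMX plumbing):

    #include <assert.h>

    enum list_type { HOST, GUEST, GUEST_LOADONLY };

    struct counts { unsigned int save, load, host; };

    static void sublist(const struct counts *c, enum list_type t,
                        unsigned int *start, unsigned int *end)
    {
        switch ( t )
        {
        case HOST:           *start = 0;       *end = c->host; break;
        case GUEST:          *start = 0;       *end = c->save; break;
        case GUEST_LOADONLY: *start = c->save; *end = c->load; break;
        }
    }

    int main(void)
    {
        /* Two save+load entries followed by three load-only entries. */
        struct counts c = { .save = 2, .load = 5, .host = 0 };
        unsigned int s, e;

        sublist(&c, GUEST, &s, &e);
        assert(s == 0 && e == 2);

        sublist(&c, GUEST_LOADONLY, &s, &e);
        assert(s == 2 && e == 5);

        return 0;
    }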
---
 xen/arch/x86/hvm/vmx/vmcs.c        | 46 +++++++++++++++++++++++-------
 xen/arch/x86/hvm/vmx/vmx.c         |  2 +-
 xen/include/asm-x86/hvm/vmx/vmcs.h |  7 ++++-
 3 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index af422b3f92..ca652c49cb 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1310,7 +1310,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
 {
     const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry *start = NULL, *ent, *end;
-    unsigned int total;
+    unsigned int substart, subend, total;
 
     ASSERT(v == current || !vcpu_runnable(v));
 
@@ -1318,12 +1318,23 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
     {
     case VMX_MSR_HOST:
         start    = vmx->host_msr_area;
-        total    = vmx->host_msr_count;
+        substart = 0;
+        subend   = vmx->host_msr_count;
+        total    = subend;
         break;
 
     case VMX_MSR_GUEST:
         start    = vmx->msr_area;
-        total    = vmx->msr_count;
+        substart = 0;
+        subend   = vmx->msr_save_count;
+        total    = vmx->msr_load_count;
+        break;
+
+    case VMX_MSR_GUEST_LOADONLY:
+        start    = vmx->msr_area;
+        substart = vmx->msr_save_count;
+        subend   = vmx->msr_load_count;
+        total    = subend;
         break;
 
     default:
@@ -1334,7 +1345,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
         return NULL;
 
     end = start + total;
-    ent = locate_msr_entry(start, end, msr);
+    ent = locate_msr_entry(start + substart, start + subend, msr);
 
     return ((ent < end) && (ent->index == msr)) ? ent : NULL;
 }
@@ -1344,7 +1355,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
 {
     struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
-    unsigned int total;
+    unsigned int substart, subend, total;
     int rc;
 
     ASSERT(v == current || !vcpu_runnable(v));
@@ -1353,12 +1364,23 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
     {
     case VMX_MSR_HOST:
         ptr      = &vmx->host_msr_area;
-        total    = vmx->host_msr_count;
+        substart = 0;
+        subend   = vmx->host_msr_count;
+        total    = subend;
         break;
 
     case VMX_MSR_GUEST:
         ptr      = &vmx->msr_area;
-        total    = vmx->msr_count;
+        substart = 0;
+        subend   = vmx->msr_save_count;
+        total    = vmx->msr_load_count;
+        break;
+
+    case VMX_MSR_GUEST_LOADONLY:
+        ptr      = &vmx->msr_area;
+        substart = vmx->msr_save_count;
+        subend   = vmx->msr_load_count;
+        total    = subend;
         break;
 
     default:
@@ -1388,6 +1410,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
             break;
 
         case VMX_MSR_GUEST:
+        case VMX_MSR_GUEST_LOADONLY:
             __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
             __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
             break;
@@ -1396,7 +1419,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
 
     start = *ptr;
     end   = start + total;
-    ent   = locate_msr_entry(start, end, msr);
+    ent   = locate_msr_entry(start + substart, start + subend, msr);
 
     if ( (ent < end) && (ent->index == msr) )
         goto found;
@@ -1420,8 +1443,11 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
         break;
 
     case VMX_MSR_GUEST:
-        __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
-        __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
+        __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count);
+
+        /* Fallthrough */
+    case VMX_MSR_GUEST_LOADONLY:
+        __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count);
         break;
     }
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 5745543e49..1e32f61225 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4200,7 +4200,7 @@ out:
 static void lbr_tsx_fixup(void)
 {
     struct vcpu *curr = current;
-    unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
+    unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count;
     struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
     struct vmx_msr_entry *msr;
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index f94a108ea5..57e5098b99 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -139,7 +139,8 @@ struct arch_vmx_struct {
      */
     struct vmx_msr_entry *msr_area;
     struct vmx_msr_entry *host_msr_area;
-    unsigned int         msr_count;
+    unsigned int         msr_load_count;
+    unsigned int         msr_save_count;
     unsigned int         host_msr_count;
 
     unsigned long        eoi_exitmap_changed;
@@ -542,12 +543,16 @@ enum vmx_insn_errno
 enum vmx_msr_list_type {
     VMX_MSR_HOST,           /* MSRs loaded on VMExit.                   */
     VMX_MSR_GUEST,          /* MSRs saved on VMExit, loaded on VMEntry. */
+    VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only.             */
 };
 
 /**
  * Add an MSR to an MSR list (inserting space for the entry if necessary), and
  * set the MSRs value.
  *
+ * It is undefined behaviour to try and insert the same MSR into both the
+ * GUEST and GUEST_LOADONLY list.
+ *
  * May fail if unable to allocate memory for the list, or the total number of
  * entries exceeds the memory allocated.
  */
-- 
2.17.1


From 1d32c21975097e64a7ecf0932680a3b6d53d00a4 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Thu, 19 Jul 2018 11:54:45 +0200
Subject: [PATCH 29/42] VMX: fix vmx_{find,del}_msr() build

Older gcc at -O2 (and perhaps higher) does not recognize that apparently
uninitialized variables aren't really uninitialized. Pull out the
assignments used by two of the three case blocks and make them
initializers of the variables, as I think I had suggested during review.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
(cherry picked from commit 97cb0516a322ecdf0032fa9d8aa1525c03d7772f)
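
A minimal reproduction of the warning class being silenced (illustrative
only; whether gcc actually warns depends on version and optimisation
level):

    /* All paths reaching the final return do set x, but an older gcc
     * at -O2 may not prove this, and warns that x "may be used
     * uninitialized". */
    int before(int type)
    {
        int x;

        switch ( type )
        {
        case 0: x = 1; break;
        case 1: x = 2; break;
        default: return -1;
        }

        return x;
    }

    /* The fix applied here: initialise with the common case and let the
     * other cases override, so no path even appears uninitialised. */
    int after(int type)
    {
        int x = 2;

        if ( type == 0 )
            x = 1;
        else if ( type != 1 )
            return -1;

        return x;
    }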
---
 xen/arch/x86/hvm/vmx/vmcs.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index ca652c49cb..30a33dd0bd 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1310,7 +1310,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
 {
     const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
     struct vmx_msr_entry *start = NULL, *ent, *end;
-    unsigned int substart, subend, total;
+    unsigned int substart = 0, subend = vmx->msr_save_count;
+    unsigned int total = vmx->msr_load_count;
 
     ASSERT(v == current || !vcpu_runnable(v));
 
@@ -1318,23 +1319,18 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
     {
     case VMX_MSR_HOST:
         start    = vmx->host_msr_area;
-        substart = 0;
         subend   = vmx->host_msr_count;
         total    = subend;
         break;
 
     case VMX_MSR_GUEST:
         start    = vmx->msr_area;
-        substart = 0;
-        subend   = vmx->msr_save_count;
-        total    = vmx->msr_load_count;
         break;
 
     case VMX_MSR_GUEST_LOADONLY:
         start    = vmx->msr_area;
-        substart = vmx->msr_save_count;
-        subend   = vmx->msr_load_count;
-        total    = subend;
+        substart = subend;
+        subend   = total;
         break;
 
     default:
-- 
2.17.1


From fa79f9e762be390b56218437ed317a695a03a5e7 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <sstabellini@kernel.org>
Date: Mon, 13 Aug 2018 17:25:51 +0100
Subject: [PATCH 30/42] ARM: disable grant table v2

It was never expected to work; the implementation is incomplete.

As a side effect, it also prevents guests from triggering a
"BUG_ON(page_get_owner(pg) != d)" in gnttab_unpopulate_status_frames().

This is XSA-268.

Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Acked-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 9a5c16a3e75778c8a094ca87784d93b74676f46c)
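
The mechanism, in isolation: the architecture header may pre-define the
limit, and the common code only supplies a fallback.  A compilable toy of
the same pattern (macro names match the patch, the surrounding context
does not):

    /* As if from xen/include/asm-arm/grant_table.h: */
    #define GNTTAB_MAX_VERSION 1

    /* As in xen/common/grant_table.c: the default only applies when no
     * architecture override exists. */
    #ifndef GNTTAB_MAX_VERSION
    #define GNTTAB_MAX_VERSION 2
    #endif

    #include <stdio.h>

    int main(void)
    {
        printf("max gnttab version: %d\n", GNTTAB_MAX_VERSION);  /* 1 */
        return 0;
    }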
---
 docs/misc/xen-command-line.markdown | 2 ++
 xen/common/grant_table.c            | 6 +++++-
 xen/include/asm-arm/grant_table.h   | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 3b710b71fb..e5e7fdc405 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -936,6 +936,8 @@ version are 1 and 2.
 use of grant table v2 without transitive grants is an ABI breakage from the
 guests point of view.
 
+The usage of gnttab v2 is not security supported on ARM platforms.
+
 ### gnttab\_max\_frames
 > `= <integer>`
 
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index c757b7f6f5..231ecf509a 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -97,7 +97,11 @@ static unsigned int __read_mostly max_maptrack_frames =
                                                DEFAULT_MAX_MAPTRACK_FRAMES;
 integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames);
 
-static unsigned int __read_mostly opt_gnttab_max_version = 2;
+#ifndef GNTTAB_MAX_VERSION
+#define GNTTAB_MAX_VERSION 2
+#endif
+
+static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
 static bool __read_mostly opt_transitive_grants = true;
 
 static int __init parse_gnttab(const char *s)
diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
index e52936c79f..24958e4670 100644
--- a/xen/include/asm-arm/grant_table.h
+++ b/xen/include/asm-arm/grant_table.h
@@ -7,6 +7,7 @@
 #include <xen/sched.h>
 
 #define INITIAL_NR_GRANT_FRAMES 1U
+#define GNTTAB_MAX_VERSION 1
 
 struct grant_table_arch {
     gfn_t *shared_gfn;
-- 
2.17.1


From 48fb482ef695c6b193ccfca665e6dd302eb230e2 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 13 Aug 2018 17:26:21 +0100
Subject: [PATCH 31/42] x86/vtx: Fix the checking for unknown/invalid
 MSR_DEBUGCTL bits

The VPMU_MODE_OFF early-exit in vpmu_do_wrmsr() introduced by c/s
11fe998e56 bypasses all reserved bit checking in the general case.  As a
result, a guest can enable BTS when it shouldn't be permitted to, and
lock up the entire host.

With vPMU active (not a security supported configuration, but useful for
debugging), the reserved bit checking is broken, a problem introduced by the
original BTS changeset 1a8aa75ed.

From a correctness standpoint, it is not possible to have two different
pieces of code responsible for different parts of value checking unless
there is an accumulation of the bits which have already been checked.  A
practical upshot of this is that a guest can set any value it
wishes (usually resulting in a vmentry failure for bad guest state).

Therefore, fix this by implementing all the reserved bit checking in the
main MSR_DEBUGCTL block, and removing all handling of DEBUGCTL from the
vPMU MSR logic.

This is XSA-269.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 2a8a8e99feb950504559196521bc9fd63ed3a962)
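
A sketch of the accumulating style the patch switches to, standalone and
with illustrative bit positions (LBR/BTF/TR/RTM values as per the SDM, but
verify against the real headers):

    #include <assert.h>
    #include <stdint.h>

    #define DEBUGCTL_LBR  (1ull << 0)
    #define DEBUGCTL_BTF  (1ull << 1)
    #define DEBUGCTL_TR   (1ull << 6)
    #define DEBUGCTL_RTM  (1ull << 15)

    /* Start with everything reserved, clear bits as each feature check
     * passes, and reject any leftovers in a single final test. */
    static int check_debugctl(uint64_t val, int has_bts, int has_rtm)
    {
        uint64_t rsvd = ~(DEBUGCTL_LBR | DEBUGCTL_BTF);

        if ( has_bts )
            rsvd &= ~DEBUGCTL_TR;

        if ( has_rtm )
            rsvd &= ~DEBUGCTL_RTM;

        return (val & rsvd) ? -1 /* #GP */ : 0 /* accept */;
    }

    int main(void)
    {
        assert(check_debugctl(DEBUGCTL_LBR, 0, 0) == 0);
        assert(check_debugctl(DEBUGCTL_TR, 0, 0) < 0);   /* BTS hidden */
        return 0;
    }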
---
 xen/arch/x86/cpu/vpmu_intel.c | 20 --------------------
 xen/arch/x86/hvm/vmx/vmx.c    | 29 ++++++++++++++++++++---------
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
index 1fc79c9ff4..6e27f6ec8e 100644
--- a/xen/arch/x86/cpu/vpmu_intel.c
+++ b/xen/arch/x86/cpu/vpmu_intel.c
@@ -533,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
     uint64_t *enabled_cntrs;
 
     if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
-    {
-        /* Special handling for BTS */
-        if ( msr == MSR_IA32_DEBUGCTLMSR )
-        {
-            supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
-                         IA32_DEBUGCTLMSR_BTINT;
-
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
-                supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS |
-                             IA32_DEBUGCTLMSR_BTS_OFF_USR;
-            if ( !(msr_content & ~supported) &&
-                 vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
-                return 0;
-            if ( (msr_content & supported) &&
-                 !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
-                printk(XENLOG_G_WARNING
-                       "%pv: Debug Store unsupported on this CPU\n",
-                       current);
-        }
         return -EINVAL;
-    }
 
     ASSERT(!supported);
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 1e32f61225..c7cf3a8fbc 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3038,11 +3038,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
 static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 {
     struct vcpu *v = current;
+    const struct cpuid_policy *cp = v->domain->arch.cpuid;
 
     HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
 
     switch ( msr )
     {
+        uint64_t rsvd;
+
     case MSR_IA32_SYSENTER_CS:
         __vmwrite(GUEST_SYSENTER_CS, msr_content);
         break;
@@ -3095,18 +3098,26 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
         wrmsrl(MSR_SYSCALL_MASK, msr_content);
         break;
 
-    case MSR_IA32_DEBUGCTLMSR: {
-        uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
+    case MSR_IA32_DEBUGCTLMSR:
+        rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);
 
-        if ( boot_cpu_has(X86_FEATURE_RTM) )
-            supported |= IA32_DEBUGCTLMSR_RTM;
-        if ( msr_content & ~supported )
+        /* TODO: Wire vPMU settings properly through the CPUID policy */
+        if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
         {
-            /* Perhaps some other bits are supported in vpmu. */
-            if ( vpmu_do_wrmsr(msr, msr_content, supported) )
-                break;
+            rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
+                      IA32_DEBUGCTLMSR_BTINT);
+
+            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+                rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS |
+                          IA32_DEBUGCTLMSR_BTS_OFF_USR);
         }
 
+        if ( cp->feat.rtm )
+            rsvd &= ~IA32_DEBUGCTLMSR_RTM;
+
+        if ( msr_content & rsvd )
+            goto gp_fault;
+
         /*
          * When a guest first enables LBR, arrange to save and restore the LBR
          * MSRs and allow the guest direct access.
@@ -3165,7 +3176,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 
         __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
         break;
-    }
+
     case MSR_IA32_FEATURE_CONTROL:
     case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
         /* None of these MSRs are writeable. */
-- 
2.17.1


From e6441a804b76797c6ebac81b7d70ff19e5df9188 Mon Sep 17 00:00:00 2001
From: Christian Lindig <christian.lindig@citrix.com>
Date: Mon, 13 Aug 2018 17:26:56 +0100
Subject: [PATCH 32/42] tools/oxenstored: Make evaluation order explicit

In Store.path_write(), Path.apply_modify() updates the node_created
reference and both the value of apply_modify() and node_created are
returned by path_write().

At least with OCaml 4.06.1 this leads to the value of node_created being
returned *before* it is updated by apply_modify().  This in turn leads
to the quota for a domain not being updated in Store.write().  Hence, a
guest can create an unlimited number of entries in xenstore.

The fix is to make evaluation order explicit.

This is XSA-272.

Signed-off-by: Christian Lindig <christian.lindig@citrix.com>
Reviewed-by: Rob Hoes <rob.hoes@citrix.com>
(cherry picked from commit 73392c7fd14c59f8c96e0b2eeeb329e4ae9086b6)
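
The same hazard exists in C, where the order in which subexpressions of a
call are evaluated is unspecified.  A standalone C rendering of the bug
shape and its fix (the OCaml fix above is the authoritative one; the names
here are invented for the illustration):

    #include <stdio.h>

    static int node_created;

    static int apply_modify(void)
    {
        node_created = 1;   /* Side effect the caller must observe. */
        return 42;
    }

    int main(void)
    {
        /* BUGGY SHAPE: nothing orders apply_modify() against the read
         * of node_created, so "42 0" may be printed, just as OCaml's
         * right-to-left pair evaluation read the ref too early. */
        /* printf("%d %d\n", apply_modify(), node_created); */

        /* FIXED SHAPE: sequence the call explicitly, then read. */
        int root = apply_modify();
        printf("%d %d\n", root, node_created);   /* Always "42 1". */

        return 0;
    }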
---
 tools/ocaml/xenstored/store.ml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml
index 13cf3b5bf4..5a8c377603 100644
--- a/tools/ocaml/xenstored/store.ml
+++ b/tools/ocaml/xenstored/store.ml
@@ -262,7 +262,8 @@ let path_write store perm path value =
 		Node.check_perm store.root perm Perms.WRITE;
 		Node.set_value store.root value, false
 	) else
-		Path.apply_modify store.root path do_write, !node_created
+		let root = Path.apply_modify store.root path do_write in
+		root, !node_created
 
 let path_rm store perm path =
 	let do_rm node name =
-- 
2.17.1


From d044f6cc590c58178d87ad78f1859d1c7905ee0b Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Wed, 25 Jul 2018 12:10:19 +0000
Subject: [PATCH 33/42] x86/spec-ctrl: Calculate safe PTE addresses for L1TF
 mitigations

Safe PTE addresses for L1TF mitigations are ones which are within the L1D
address width (may be wider than reported in CPUID), and above the highest
cacheable RAM/NVDIMM/BAR/etc.

All logic here is best-effort heuristics, which should in practice be fine for
most hardware.  Future work will see about disentangling the SRAT handling
further, as well as having L0 pass this information down to lower levels when
virtualised.

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit b03a57c9383b32181e60add6b6de12b473652aa4)
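
The final heuristic reduces to a short computation; a standalone toy of it
(the widths are example inputs, not read from hardware):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int paddr_bits = 46;       /* CPUID maxphysaddr */
        unsigned int l1d_maxphysaddr = 46;  /* from the model table */
        uint64_t safe;

        if ( l1d_maxphysaddr > paddr_bits )
            safe = 1ull << paddr_bits;        /* Any higher bit is safe. */
        else
            safe = 3ull << (paddr_bits - 2);  /* "Top quarter" guess. */

        /* E820/SRAT/EFI scans may push this higher still. */
        printf("l1tf_safe_maddr >= %#llx\n", (unsigned long long)safe);
        return 0;
    }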
---
 xen/arch/x86/setup.c            |  12 +++
 xen/arch/x86/spec_ctrl.c        | 153 ++++++++++++++++++++++++++++++++
 xen/arch/x86/srat.c             |   8 +-
 xen/common/efi/boot.c           |  12 +++
 xen/include/asm-x86/spec_ctrl.h |   7 ++
 5 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 66fd13f93a..3cd3e81b30 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -912,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     /* Sanitise the raw E820 map to produce a final clean version. */
     max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
 
+    if ( !efi_enabled(EFI_BOOT) )
+    {
+        /*
+         * Supplement the heuristics in l1tf_calculations() by assuming that
+         * anything referenced in the E820 may be cacheable.
+         */
+        l1tf_safe_maddr =
+            max(l1tf_safe_maddr,
+                ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
+                        e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
+    }
+
     /* Create a temporary copy of the E820 map. */
     memcpy(&boot_e820, &e820, sizeof(e820));
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 32213ace86..fe15a58de0 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -50,6 +50,10 @@ bool __initdata bsp_delay_spec_ctrl;
 uint8_t __read_mostly default_xen_spec_ctrl;
 uint8_t __read_mostly default_spec_ctrl_flags;
 
+paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
+static bool __initdata cpu_has_bug_l1tf;
+static unsigned int __initdata l1d_maxphysaddr;
+
 static int __init parse_bti(const char *s)
 {
     const char *ss;
@@ -420,6 +424,153 @@ static bool __init should_use_eager_fpu(void)
     }
 }
 
+/* Calculate whether this CPU is vulnerable to L1TF. */
+static __init void l1tf_calculations(uint64_t caps)
+{
+    bool hit_default = false;
+
+    l1d_maxphysaddr = paddr_bits;
+
+    /* L1TF is only known to affect Intel Family 6 processors at this time. */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 )
+    {
+        switch ( boot_cpu_data.x86_model )
+        {
+            /*
+             * Core processors since at least Penryn are vulnerable.
+             */
+        case 0x17: /* Penryn */
+        case 0x1d: /* Dunnington */
+            cpu_has_bug_l1tf = true;
+            break;
+
+        case 0x1f: /* Auburndale / Havendale */
+        case 0x1e: /* Nehalem */
+        case 0x1a: /* Nehalem EP */
+        case 0x2e: /* Nehalem EX */
+        case 0x25: /* Westmere */
+        case 0x2c: /* Westmere EP */
+        case 0x2f: /* Westmere EX */
+            cpu_has_bug_l1tf = true;
+            l1d_maxphysaddr = 44;
+            break;
+
+        case 0x2a: /* SandyBridge */
+        case 0x2d: /* SandyBridge EP/EX */
+        case 0x3a: /* IvyBridge */
+        case 0x3e: /* IvyBridge EP/EX */
+        case 0x3c: /* Haswell */
+        case 0x3f: /* Haswell EX/EP */
+        case 0x45: /* Haswell D */
+        case 0x46: /* Haswell H */
+        case 0x3d: /* Broadwell */
+        case 0x47: /* Broadwell H */
+        case 0x4f: /* Broadwell EP/EX */
+        case 0x56: /* Broadwell D */
+        case 0x4e: /* Skylake M */
+        case 0x55: /* Skylake X */
+        case 0x5e: /* Skylake D */
+        case 0x66: /* Cannonlake */
+        case 0x67: /* Cannonlake? */
+        case 0x8e: /* Kabylake M */
+        case 0x9e: /* Kabylake D */
+            cpu_has_bug_l1tf = true;
+            l1d_maxphysaddr = 46;
+            break;
+
+            /*
+             * Atom processors are not vulnerable.
+             */
+        case 0x1c: /* Pineview */
+        case 0x26: /* Lincroft */
+        case 0x27: /* Penwell */
+        case 0x35: /* Cloverview */
+        case 0x36: /* Cedarview */
+        case 0x37: /* Baytrail / Valleyview (Silvermont) */
+        case 0x4d: /* Avaton / Rangely (Silvermont) */
+        case 0x4c: /* Cherrytrail / Brasswell */
+        case 0x4a: /* Merrifield */
+        case 0x5a: /* Moorefield */
+        case 0x5c: /* Goldmont */
+        case 0x5f: /* Denverton */
+        case 0x7a: /* Gemini Lake */
+            break;
+
+            /*
+             * Knights processors are not vulnerable.
+             */
+        case 0x57: /* Knights Landing */
+        case 0x85: /* Knights Mill */
+            break;
+
+        default:
+            /* Defer printk() until we've accounted for RDCL_NO. */
+            hit_default = true;
+            cpu_has_bug_l1tf = true;
+            break;
+        }
+    }
+
+    /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */
+    if ( caps & ARCH_CAPABILITIES_RDCL_NO )
+        cpu_has_bug_l1tf = false;
+
+    if ( cpu_has_bug_l1tf && hit_default )
+        printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
+               boot_cpu_data.x86_model);
+
+    /*
+     * L1TF safe address heuristics.  These apply to the real hardware we are
+     * running on, and are best-effort-only if Xen is virtualised.
+     *
+     * The address mask which the L1D cache uses, which might be wider than
+     * the CPUID-reported maxphysaddr.
+     */
+    l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
+
+    /*
+     * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
+     * in system physical address space.  However, to preserve space for
+     * paged-out metadata, it should be as low as possible above the highest
+     * cacheable address, so as to require fewer high-order bits being set.
+     *
+     * These heuristics are based on some guesswork to improve the likelihood
+     * of safety in the common case, including Linux's L1TF mitigation of
+     * inverting all address bits in a non-present PTE.
+     *
+     * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
+     *   server), setting any address bit beyond CPUID maxphysaddr guarantees
+     *   to make the PTE safe.  This case doesn't require all the high-order
+     *   bits being set, and doesn't require any other source of information
+     *   for safety.
+     *
+     * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
+     *   must sacrifice high order bits from the real address space for
+     *   safety.  Therefore, make a blind guess that there is nothing
+     *   cacheable in the top quarter of physical address space.
+     *
+     *   It is exceedingly unlikely for machines to be populated with this
+     *   much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
+     *   Sandybridge and later) due to the sheer volume of DIMMs this would
+     *   actually take.
+     *
+     *   However, it is possible to find machines this large, so the "top
+     *   quarter" guess is supplemented to push the limit higher if references
+     *   to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
+     *   quarter boundary.
+     *
+     *   Finally, this top quarter guess gives us a good chance of being safe
+     *   when running virtualised (and the CPUID maxphysaddr hasn't been
+     *   levelled for heterogeneous migration safety), where the safety
+     *   consideration is still in terms of host details, but all E820/etc
+     *   information is in terms of guest physical layout.
+     */
+    l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
+                                            ? (1ul << paddr_bits)
+                                            : (3ul << (paddr_bits - 2))));
+}
+
 int8_t __read_mostly opt_xpti = -1;
 
 static __init void xpti_init_default(uint64_t caps)
@@ -633,6 +784,8 @@ void __init init_speculation_mitigations(void)
     else
         setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
 
+    l1tf_calculations(caps);
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 166eb44fe2..2d70b45909 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -20,6 +20,7 @@
 #include <xen/pfn.h>
 #include <asm/e820.h>
 #include <asm/page.h>
+#include <asm/spec_ctrl.h>
 
 static struct acpi_table_slit *__read_mostly acpi_slit;
 
@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
 	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
 		return;
 
+	start = ma->base_address;
+	end = start + ma->length;
+	/* Supplement the heuristics in l1tf_calculations(). */
+	l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
+
 	if (num_node_memblks >= NR_NODE_MEMBLKS)
 	{
 		dprintk(XENLOG_WARNING,
@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
-	start = ma->base_address;
-	end = start + ma->length;
 	pxm = ma->proximity_domain;
 	if (srat_rev < 2)
 		pxm &= 0xff;
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 64d12685d3..6be0b3986f 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
 
 #ifndef CONFIG_ARM /* TODO - runtime service support */
 
+#include <asm/spec_ctrl.h>
+
 static bool __initdata efi_map_uc;
 
 static int __init parse_efi_param(const char *s)
@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void)
                desc->PhysicalStart, desc->PhysicalStart + len - 1,
                desc->Type, desc->Attribute);
 
+        if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
+             (efi_bs_revision >= EFI_REVISION(2, 5) &&
+              (desc->Attribute & EFI_MEMORY_WP)) )
+        {
+            /* Supplement the heuristics in l1tf_calculations(). */
+            l1tf_safe_maddr =
+                max(l1tf_safe_maddr,
+                    ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
+        }
+
         if ( !efi_enabled(EFI_RS) ||
              (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
               (!map_bs ||
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index fea82603ca..d7e8ed0f5f 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -38,6 +38,13 @@ extern int8_t opt_xpti;
 #define OPT_XPTI_DOM0  0x01
 #define OPT_XPTI_DOMU  0x02
 
+/*
+ * The L1D address mask, which might be wider than reported in CPUID, and the
+ * system physical address above which there are believed to be no cacheable
+ * memory regions, thus unable to leak data via the L1TF vulnerability.
+ */
+extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
+
 static inline void init_shadow_spec_ctrl_state(void)
 {
     struct cpu_info *info = get_cpu_info();
-- 
2.17.1


From 57483c09ef4fe9489ec4214989a97949916fecc0 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 23 Jul 2018 13:46:10 +0000
Subject: [PATCH 34/42] x86/spec-ctrl: Introduce an option to control L1TF
 mitigation for PV guests

Shadowing a PV guest is only available when shadow paging is compiled in.
When shadow paging isn't available, guests can be crashed instead, which
still serves as a mitigation from Xen's point of view.

Ideally, dom0 would also be potentially-shadowed-by-default, but dom0 has
never been shadowed before, and there are some stability issues under
investigation.

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 66a4e986819a86ba66ca2fe9d925e62a4fd30114)
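
The resulting default selection is compact enough to state directly; a
condensed restatement of the logic added to init_speculation_mitigations()
below (not additional code):

    if ( opt_pv_l1tf == -1 )
        opt_pv_l1tf = (pv_shim || !cpu_has_bug_l1tf)
                      ? 0 : OPT_PV_L1TF_DOMU;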
---
 docs/misc/xen-command-line.markdown | 24 ++++++++
 xen/arch/x86/Kconfig                |  1 +
 xen/arch/x86/spec_ctrl.c            | 89 ++++++++++++++++++++++++++++-
 xen/include/asm-x86/spec_ctrl.h     |  4 ++
 4 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index e5e7fdc405..763cc1d878 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1546,6 +1546,30 @@ do; there may be other custom operating systems which do.  If you're
 certain you don't plan on having PV guests which use this feature,
 turning it off can reduce the attack surface.
 
+### pv-l1tf (x86)
+> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]`
+
+> Default: `false` on believed-unaffected hardware, or in pv-shim mode.
+>          `domu`  on believed-affected hardware.
+
+Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests.
+
+For backwards compatibility, we may not alter an architecturally-legitimate
+pagetable entry a PV guest chooses to write.  We can however force such a
+guest into shadow mode so that Xen controls the PTEs which are reachable by
+the CPU pagewalk.
+
+Shadowing is performed at the point where a PV guest first tries to write an
+L1TF-vulnerable PTE.  Therefore, a PV guest kernel which has been updated with
+its own L1TF mitigations will not trigger shadow mode if it is well behaved.
+
+If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes
+the guest when an L1TF-vulnerable PTE is written, which still allows updated,
+well-behaved PV guests to run, despite Shadow being compiled out.
+
+In the pv-shim case, Shadow is expected to be compiled out, and a malicious
+guest kernel can only leak data from the shim Xen, rather than the host Xen.
+
 ### pv-shim (x86)
 > `= <boolean>`
 
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index f64fc56739..cfba4a708c 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -72,6 +72,7 @@ config SHADOW_PAGING
             * Running HVM guests on hardware lacking hardware paging support
               (First-generation Intel VT-x or AMD SVM).
             * Live migration of PV guests.
+            * L1TF sidechannel mitigation for PV guests.
 
           Under a small number of specific workloads, shadow paging may be
           deliberately used as a performance optimisation.
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index fe15a58de0..7995e27218 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -23,6 +23,7 @@
 #include <asm/microcode.h>
 #include <asm/msr.h>
 #include <asm/processor.h>
+#include <asm/pv/shim.h>
 #include <asm/spec_ctrl.h>
 #include <asm/spec_ctrl_asm.h>
 
@@ -203,6 +204,55 @@ static int __init parse_spec_ctrl(const char *s)
 }
 custom_param("spec-ctrl", parse_spec_ctrl);
 
+int8_t __read_mostly opt_pv_l1tf = -1;
+
+static __init int parse_pv_l1tf(const char *s)
+{
+    const char *ss;
+    int val, rc = 0;
+
+    /* Inhibit the defaults as an explicit choice has been given. */
+    if ( opt_pv_l1tf == -1 )
+        opt_pv_l1tf = 0;
+
+    /* Interpret 'pv-l1tf' alone in its positive boolean form. */
+    if ( *s == '\0' )
+        opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+
+    do {
+        ss = strchr(s, ',');
+        if ( !ss )
+            ss = strchr(s, '\0');
+
+        switch ( parse_bool(s, ss) )
+        {
+        case 0:
+            opt_pv_l1tf = 0;
+            break;
+
+        case 1:
+            opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+            break;
+
+        default:
+            if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
+                opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
+                               (val ? OPT_PV_L1TF_DOM0 : 0));
+            else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
+                opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
+                               (val ? OPT_PV_L1TF_DOMU : 0));
+            else
+                rc = -EINVAL;
+            break;
+        }
+
+        s = ss + 1;
+    } while ( *ss );
+
+    return rc;
+}
+custom_param("pv-l1tf", parse_pv_l1tf);
+
 static void __init print_details(enum ind_thunk thunk, uint64_t caps)
 {
     unsigned int _7d0 = 0, e8b = 0, tmp;
@@ -226,9 +276,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (caps & ARCH_CAPS_RSBA)                  ? " RSBA"      : "",
            (caps & ARCH_CAPS_SSB_NO)                ? " SSB_NO"    : "");
 
-    /* Compiled-in support which pertains to BTI mitigations. */
-    if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
-        printk("  Compiled-in support: INDIRECT_THUNK\n");
+    /* Compiled-in support which pertains to mitigations. */
+    if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+        printk("  Compiled-in support:"
+#ifdef CONFIG_INDIRECT_THUNK
+               " INDIRECT_THUNK"
+#endif
+#ifdef CONFIG_SHADOW_PAGING
+               " SHADOW_PAGING"
+#endif
+               "\n");
 
     /* Settings for Xen's protection, irrespective of guests. */
     printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
@@ -242,6 +299,13 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
            opt_ibpb                                  ? " IBPB"  : "");
 
+    /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+    if ( cpu_has_bug_l1tf || opt_pv_l1tf )
+        printk("  L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
+               ", Safe address %"PRIx64"\n",
+               cpu_has_bug_l1tf ? "" : " not",
+               l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr);
+
     /*
      * Alternatives blocks for protecting against and/or virtualising
      * mitigation support for guests.
@@ -263,6 +327,10 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
            opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
            opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
+
+    printk("  PV L1TF shadowing: Dom0 %s, DomU %s\n",
+           opt_pv_l1tf & OPT_PV_L1TF_DOM0  ? "enabled"  : "disabled",
+           opt_pv_l1tf & OPT_PV_L1TF_DOMU  ? "enabled"  : "disabled");
 }
 
 /* Calculate whether Retpoline is known-safe on this CPU. */
@@ -786,6 +854,21 @@ void __init init_speculation_mitigations(void)
 
     l1tf_calculations(caps);
 
+    /*
+     * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable
+     * hardware, except when running in shim mode.
+     *
+     * In shim mode, SHADOW is expected to be compiled out, and a malicious
+     * guest kernel can only attack the shim Xen, not the host Xen.
+     */
+    if ( opt_pv_l1tf == -1 )
+    {
+        if ( pv_shim || !cpu_has_bug_l1tf )
+            opt_pv_l1tf = 0;
+        else
+            opt_pv_l1tf = OPT_PV_L1TF_DOMU;
+    }
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index d7e8ed0f5f..cdf5737dc2 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -38,6 +38,10 @@ extern int8_t opt_xpti;
 #define OPT_XPTI_DOM0  0x01
 #define OPT_XPTI_DOMU  0x02
 
+extern int8_t opt_pv_l1tf;
+#define OPT_PV_L1TF_DOM0  0x01
+#define OPT_PV_L1TF_DOMU  0x02
+
 /*
  * The L1D address mask, which might be wider than reported in CPUID, and the
  * system physical address above which there are believed to be no cacheable
-- 
2.17.1


From 02d2c660935cfd6ff2438afb3892776dfc7db711 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 23 Jul 2018 07:11:40 +0100
Subject: [PATCH 35/42] x86/shadow: Infrastructure to force a PV guest into
 shadow mode

To mitigate L1TF, we cannot alter an architecturally-legitimate PTE a PV guest
chooses to write, but we can force the PV domain into shadow mode so Xen
controls the PTEs which are reachable by the CPU pagewalk.

Introduce new shadow mode, PG_SH_forced, and a tasklet to perform the
transition.  Later patches will introduce the logic to enable this mode at the
appropriate time.

To simplify vcpu cleanup, make tasklet_kill() idempotent with respect to
tasklet_init(), which involves adding a helper to check for an uninitialised
list head.

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit b76ec3946bf6caca2c3950b857c008bc8db6723f)
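
Why the new list_head_is_null() helper makes tasklet_kill() idempotent: a
zero-allocated tasklet has NULL list pointers, whereas tasklet_init() makes
them self-referencing.  A standalone sketch of just that distinction:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    static int list_head_is_null(const struct list_head *l)
    {
        return !l->next && !l->prev;
    }

    struct tasklet { struct list_head list; };

    static void tasklet_kill(struct tasklet *t)
    {
        if ( list_head_is_null(&t->list) )
            return;     /* Never initialised: nothing to tear down. */

        /* ... the real function unlinks and waits here ... */
    }

    int main(void)
    {
        struct tasklet t = { { NULL, NULL } };  /* As if zero-allocated. */

        tasklet_kill(&t);                       /* Safe no-op. */
        return 0;
    }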
---
 xen/arch/x86/mm/paging.c        |  2 ++
 xen/arch/x86/mm/shadow/common.c | 36 +++++++++++++++++++++++++++++++++
 xen/arch/x86/pv/domain.c        |  5 +++++
 xen/common/tasklet.c            |  5 +++++
 xen/include/asm-x86/domain.h    |  7 +++++++
 xen/include/asm-x86/paging.h    |  4 ++++
 xen/include/asm-x86/shadow.h    | 32 +++++++++++++++++++++++++++++
 xen/include/xen/list.h          |  5 +++++
 8 files changed, 96 insertions(+)

diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 2b0445ffe9..dcee496eb0 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -873,6 +873,8 @@ void paging_dump_domain_info(struct domain *d)
         printk("    paging assistance: ");
         if ( paging_mode_shadow(d) )
             printk("shadow ");
+        if ( paging_mode_sh_forced(d) )
+            printk("forced ");
         if ( paging_mode_hap(d) )
             printk("hap ");
         if ( paging_mode_refcounts(d) )
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index dd61b50eb7..fd42d734e7 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3177,6 +3177,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
     ASSERT(paging_locked_by_me(d));
     ASSERT(d != current->domain);
 
+    /*
+     * If PG_SH_forced has previously been activated because of writing an
+     * L1TF-vulnerable PTE, it must remain active for the remaining lifetime
+     * of the domain, even if the logdirty mode needs to be controlled for
+     * migration purposes.
+     */
+    if ( paging_mode_sh_forced(d) )
+        new_mode |= PG_SH_forced | PG_SH_enable;
+
     d->arch.paging.mode = new_mode;
     for_each_vcpu(d, v)
         sh_update_paging_modes(v);
@@ -4057,6 +4066,33 @@ void shadow_audit_tables(struct vcpu *v)
 
 #endif /* Shadow audit */
 
+#ifdef CONFIG_PV
+
+void pv_l1tf_tasklet(unsigned long data)
+{
+    struct domain *d = (void *)data;
+
+    domain_pause(d);
+    paging_lock(d);
+
+    if ( !paging_mode_sh_forced(d) && !d->is_dying )
+    {
+        int ret = shadow_one_bit_enable(d, PG_SH_forced);
+
+        if ( ret )
+        {
+            printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n",
+                   d->domain_id, ret);
+            domain_crash(d);
+        }
+    }
+
+    paging_unlock(d);
+    domain_unpause(d);
+}
+
+#endif /* CONFIG_PV */
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index a4f0bd239d..3230ac6a22 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -13,6 +13,7 @@
 #include <asm/invpcid.h>
 #include <asm/spec_ctrl.h>
 #include <asm/pv/domain.h>
+#include <asm/shadow.h>
 
 static __read_mostly enum {
     PCID_OFF,
@@ -209,6 +210,8 @@ int pv_vcpu_initialise(struct vcpu *v)
 
 void pv_domain_destroy(struct domain *d)
 {
+    pv_l1tf_domain_destroy(d);
+
     destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
                               GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
 
@@ -229,6 +232,8 @@ int pv_domain_initialise(struct domain *d)
     };
     int rc = -ENOMEM;
 
+    pv_l1tf_domain_init(d);
+
     d->arch.pv_domain.gdt_ldt_l1tab =
         alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
     if ( !d->arch.pv_domain.gdt_ldt_l1tab )
diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
index 0f0a6f8365..d4fea3151c 100644
--- a/xen/common/tasklet.c
+++ b/xen/common/tasklet.c
@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t)
 
     spin_lock_irqsave(&tasklet_lock, flags);
 
+    /* Cope with uninitialised tasklets. */
+    if ( list_head_is_null(&t->list) )
+        goto unlock;
+
     if ( !list_empty(&t->list) )
     {
         BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0));
@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t)
         spin_lock_irqsave(&tasklet_lock, flags);
     }
 
+ unlock:
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index e0d413c7de..61e6900465 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -121,6 +121,11 @@ struct shadow_domain {
 
     /* Has this domain ever used HVMOP_pagetable_dying? */
     bool_t pagetable_dying_op;
+
+#ifdef CONFIG_PV
+    /* PV L1 Terminal Fault mitigation. */
+    struct tasklet pv_l1tf_tasklet;
+#endif /* CONFIG_PV */
 #endif
 };
 
@@ -257,6 +262,8 @@ struct pv_domain
     bool xpti;
     /* Use PCID feature? */
     bool pcid;
+    /* Mitigate L1TF with shadow/crashing? */
+    bool check_l1tf;
 
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index f0085511c7..f440e3e53c 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -37,11 +37,14 @@
 
 #define PG_SH_shift    20
 #define PG_HAP_shift   21
+#define PG_SHF_shift   22
 /* We're in one of the shadow modes */
 #ifdef CONFIG_SHADOW_PAGING
 #define PG_SH_enable   (1U << PG_SH_shift)
+#define PG_SH_forced   (1U << PG_SHF_shift)
 #else
 #define PG_SH_enable   0
+#define PG_SH_forced   0
 #endif
 #define PG_HAP_enable  (1U << PG_HAP_shift)
 
@@ -62,6 +65,7 @@
 
 #define paging_mode_enabled(_d)   (!!(_d)->arch.paging.mode)
 #define paging_mode_shadow(_d)    (!!((_d)->arch.paging.mode & PG_SH_enable))
+#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced))
 #define paging_mode_hap(_d)       (!!((_d)->arch.paging.mode & PG_HAP_enable))
 
 #define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts))
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 94a34fd16a..14afb7db52 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -29,6 +29,7 @@
 #include <asm/flushtlb.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
+#include <asm/spec_ctrl.h>
 
 /*****************************************************************************
  * Macros to tell which shadow paging mode a domain is in*/
@@ -115,6 +116,37 @@ static inline int shadow_domctl(struct domain *d,
 
 #endif /* CONFIG_SHADOW_PAGING */
 
+/*
+ * Mitigations for L1TF / CVE-2018-3620 for PV guests.
+ *
+ * We cannot alter an architecturally-legitimate PTE which a PV guest has
+ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable.
+ * What we can do is force a PV guest which writes a vulnerable PTE into
+ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
+ * pagewalk.
+ */
+
+void pv_l1tf_tasklet(unsigned long data);
+
+static inline void pv_l1tf_domain_init(struct domain *d)
+{
+    d->arch.pv_domain.check_l1tf =
+        opt_pv_l1tf & (is_hardware_domain(d)
+                       ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
+
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
+    tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
+                 pv_l1tf_tasklet, (unsigned long)d);
+#endif
+}
+
+static inline void pv_l1tf_domain_destroy(struct domain *d)
+{
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
+    tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet);
+#endif
+}
+
 /* Remove all shadows of the guest mfn. */
 static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
 {
diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
index fa07d720ee..1387abb211 100644
--- a/xen/include/xen/list.h
+++ b/xen/include/xen/list.h
@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
     list->prev = list;
 }
 
+static inline bool list_head_is_null(const struct list_head *list)
+{
+    return !list->next && !list->prev;
+}
+
 /*
  * Insert a new entry between two known consecutive entries. 
  *
-- 
2.17.1
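
A note on the tasklet_kill() / list_head_is_null() changes above: a tasklet
embedded in a zeroed structure has NULL list pointers, whereas tasklet_init()
makes the list head point at itself, so list_empty() alone cannot tell the
two apart.  A standalone sketch (plain C, not Xen code) of the distinction:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct list_head { struct list_head *next, *prev; };

    /* Mirrors the helper added to xen/include/xen/list.h above. */
    static bool list_head_is_null(const struct list_head *l)
    {
        return !l->next && !l->prev;
    }

    int main(void)
    {
        struct list_head zeroed, inited;

        memset(&zeroed, 0, sizeof(zeroed));  /* never initialised */
        inited.next = inited.prev = &inited; /* as INIT_LIST_HEAD() leaves it */

        /* Prints "1 0": only the never-initialised head is caught. */
        printf("%d %d\n", list_head_is_null(&zeroed),
               list_head_is_null(&inited));
        return 0;
    }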


From f4a049ede7ee9e1fafad6248cffc5e6deac1bc39 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 23 Jul 2018 08:11:40 +0200
Subject: [PATCH 36/42] x86/mm: Plumbing to allow any PTE update to fail with
 -ERESTART

Switching to shadow mode is performed in tasklet context.  To facilitate this,
we schedule the tasklet, then create a hypercall continuation to allow the
switch to take place.
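
The resulting pattern, as a simplified sketch (do_some_op, do_the_work and
__HYPERVISOR_some_op are placeholders for the real entry points changed in
the diff below):

    long do_some_op(unsigned long arg0, unsigned long arg1)
    {
        /* The underlying operation may now stop part-way with -ERESTART. */
        long rc = do_the_work(arg0, arg1);

        /* Repackage the original arguments; the guest re-enters and resumes. */
        if ( rc == -ERESTART )
            rc = hypercall_create_continuation(
                __HYPERVISOR_some_op, "ll", arg0, arg1);

        return rc;
    }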

As a consequence, the x86 mm code needs to cope with an L1e operation being
continuable.  do_mmu{,ext}_op() may no longer assert that a continuation
doesn't happen on the final iteration.

To handle the arguments correctly on continuation, compat_update_va_mapping*()
may no longer call into their non-compat counterparts.  Move the compat
functions into mm.c rather than exporting __do_update_va_mapping() and
{get,put}_pg_owner(), and fix an unsigned long/int inconsistency with
compat_update_va_mapping_otherdomain().

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit c612481d1c9232c6abf91b03ec655e92f808805f)
---
 xen/arch/x86/mm.c               | 83 ++++++++++++++++++++++++++-------
 xen/arch/x86/x86_64/compat/mm.c | 13 ------
 xen/include/asm-x86/hypercall.h |  2 +-
 3 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index bcf46c0743..657af50c4c 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -613,6 +613,9 @@ static int alloc_segdesc_page(struct page_info *page)
     return i == 512 ? 0 : -EINVAL;
 }
 
+static int _get_page_type(struct page_info *page, unsigned long type,
+                          bool preemptible);
+
 static int get_page_and_type_from_mfn(
     mfn_t mfn, unsigned long type, struct domain *d,
     int partial, int preemptible)
@@ -624,9 +627,7 @@ static int get_page_and_type_from_mfn(
          unlikely(!get_page_from_mfn(mfn, d)) )
         return -EINVAL;
 
-    rc = (preemptible ?
-          get_page_type_preemptible(page, type) :
-          (get_page_type(page, type) ? 0 : -EINVAL));
+    rc = _get_page_type(page, type, preemptible);
 
     if ( unlikely(rc) && partial >= 0 &&
          (!preemptible || page != current->arch.old_guest_table) )
@@ -1456,8 +1457,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
     return 1;
 }
 
-static int alloc_l2_table(struct page_info *page, unsigned long type,
-                          int preemptible)
+static int alloc_l2_table(struct page_info *page, unsigned long type)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = mfn_x(page_to_mfn(page));
@@ -1469,8 +1469,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
 
     for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
     {
-        if ( preemptible && i > page->nr_validated_ptes
-             && hypercall_preempt_check() )
+        if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
         {
             page->nr_validated_ptes = i;
             rc = -ERESTART;
@@ -1481,6 +1480,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
              (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
             continue;
 
+        if ( unlikely(rc == -ERESTART) )
+        {
+            page->nr_validated_ptes = i;
+            break;
+        }
+
         if ( rc < 0 )
         {
             gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
@@ -1763,7 +1768,7 @@ static void free_l1_table(struct page_info *page)
 }
 
 
-static int free_l2_table(struct page_info *page, int preemptible)
+static int free_l2_table(struct page_info *page)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = mfn_x(page_to_mfn(page));
@@ -1777,7 +1782,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
     do {
         if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
              put_page_from_l2e(pl2e[i], pfn) == 0 &&
-             preemptible && i && hypercall_preempt_check() )
+             i && hypercall_preempt_check() )
         {
            page->nr_validated_ptes = i;
            err = -ERESTART;
@@ -2373,7 +2378,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
         rc = alloc_l1_table(page);
         break;
     case PGT_l2_page_table:
-        rc = alloc_l2_table(page, type, preemptible);
+        ASSERT(preemptible);
+        rc = alloc_l2_table(page, type);
         break;
     case PGT_l3_page_table:
         ASSERT(preemptible);
@@ -2463,7 +2469,8 @@ int free_page_type(struct page_info *page, unsigned long type,
         rc = 0;
         break;
     case PGT_l2_page_table:
-        rc = free_l2_table(page, preemptible);
+        ASSERT(preemptible);
+        rc = free_l2_table(page);
         break;
     case PGT_l3_page_table:
         ASSERT(preemptible);
@@ -3550,12 +3557,9 @@ long do_mmuext_op(
     }
 
     if ( rc == -ERESTART )
-    {
-        ASSERT(i < count);
         rc = hypercall_create_continuation(
             __HYPERVISOR_mmuext_op, "hihi",
             uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
-    }
     else if ( curr->arch.old_guest_table )
     {
         XEN_GUEST_HANDLE_PARAM(void) null;
@@ -3861,12 +3865,9 @@ long do_mmu_update(
     }
 
     if ( rc == -ERESTART )
-    {
-        ASSERT(i < count);
         rc = hypercall_create_continuation(
             __HYPERVISOR_mmu_update, "hihi",
             ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
-    }
     else if ( curr->arch.old_guest_table )
     {
         XEN_GUEST_HANDLE_PARAM(void) null;
@@ -4121,7 +4122,13 @@ static int __do_update_va_mapping(
 long do_update_va_mapping(unsigned long va, u64 val64,
                           unsigned long flags)
 {
-    return __do_update_va_mapping(va, val64, flags, current->domain);
+    int rc = __do_update_va_mapping(va, val64, flags, current->domain);
+
+    if ( rc == -ERESTART )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_update_va_mapping, "lll", va, val64, flags);
+
+    return rc;
 }
 
 long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
@@ -4138,6 +4145,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
 
     put_pg_owner(pg_owner);
 
+    if ( rc == -ERESTART )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_update_va_mapping_otherdomain,
+            "llli", va, val64, flags, domid);
+
+    return rc;
+}
+
+int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi,
+                             unsigned int flags)
+{
+    int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo,
+                                    flags, current->domain);
+
+    if ( rc == -ERESTART )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags);
+
+    return rc;
+}
+
+int compat_update_va_mapping_otherdomain(unsigned int va,
+                                         uint32_t lo, uint32_t hi,
+                                         unsigned int flags, domid_t domid)
+{
+    struct domain *pg_owner;
+    int rc;
+
+    if ( (pg_owner = get_pg_owner(domid)) == NULL )
+        return -ESRCH;
+
+    rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner);
+
+    put_pg_owner(pg_owner);
+
+    if ( rc == -ERESTART )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_update_va_mapping_otherdomain,
+            "iiiii", va, lo, hi, flags, domid);
+
     return rc;
 }
 
diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
index c2aa6f2fdb..02bc75b91e 100644
--- a/xen/arch/x86/x86_64/compat/mm.c
+++ b/xen/arch/x86/x86_64/compat/mm.c
@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
     return rc;
 }
 
-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi,
-                             unsigned int flags)
-{
-    return do_update_va_mapping(va, lo | ((u64)hi << 32), flags);
-}
-
-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi,
-                                         unsigned long flags,
-                                         domid_t domid)
-{
-    return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid);
-}
-
 DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t);
 
 int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg,
diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
index 1cc2e37d5c..da38b7991c 100644
--- a/xen/include/asm-x86/hypercall.h
+++ b/xen/include/asm-x86/hypercall.h
@@ -165,7 +165,7 @@ extern int compat_update_va_mapping(
     unsigned int va, u32 lo, u32 hi, unsigned int flags);
 
 extern int compat_update_va_mapping_otherdomain(
-    unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid);
+    unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid);
 
 DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t);
 extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps);
-- 
2.17.1


From 665e7685b4f5a683101ef833c45415e2548d873f Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 23 Jul 2018 08:11:40 +0200
Subject: [PATCH 37/42] x86/pv: Force a guest into shadow mode when it writes
 an L1TF-vulnerable PTE

See the comment in shadow.h for an explanation of L1TF and the safety
consideration of the PTEs.

In the case that CONFIG_SHADOW_PAGING isn't compiled in, crash the domain
instead.  This allows well-behaved PV guests to function, while preventing
L1TF from being exploited.  (Note: PV guest kernels which haven't been updated
with L1TF mitigations will likely be crashed as soon as they try paging a
piece of userspace out to disk.)
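
A condensed sketch of the resulting logic (simplified from the
pv_l1tf_check_pte() hunk below; pte_is_vulnerable() is a hypothetical
stand-in for the real PSE and address-mask tests):

    static bool pv_l1tf_check(struct domain *d, intpte_t pte)
    {
        if ( !d->arch.pv_domain.check_l1tf || paging_mode_sh_forced(d) ||
             !pte_is_vulnerable(pte) )
            return false;                  /* harmless PTE: accept it */

    #ifdef CONFIG_SHADOW_PAGING
        /* Force the domain into shadow mode, in tasklet context. */
        tasklet_schedule(&d->arch.paging.shadow.pv_l1tf_tasklet);
    #else
        domain_crash(d);                   /* no shadow support built in */
    #endif
        return true;                       /* caller converts to -ERESTART */
    }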

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 06e8b622d3f3c0fa5075e91b041c6f45549ad70a)
---
 xen/arch/x86/mm.c               | 22 ++++++--
 xen/arch/x86/pv/ro-page-fault.c |  5 ++
 xen/include/asm-x86/shadow.h    | 94 +++++++++++++++++++++++++++++++++
 xen/include/xen/tasklet.h       |  5 ++
 4 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 657af50c4c..7d4871b791 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1116,7 +1116,7 @@ get_page_from_l2e(
     int rc;
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
 
     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
     {
@@ -1147,7 +1147,7 @@ get_page_from_l3e(
     int rc;
 
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
 
     if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
     {
@@ -1180,7 +1180,7 @@ get_page_from_l4e(
     int rc;
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
 
     if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
     {
@@ -1390,6 +1390,13 @@ static int alloc_l1_table(struct page_info *page)
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
     {
+        if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
+        {
+            ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
+            if ( ret )
+                goto out;
+        }
+
         switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
         {
         default:
@@ -1410,6 +1417,7 @@ static int alloc_l1_table(struct page_info *page)
 
  fail:
     gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
+ out:
     while ( i-- > 0 )
         put_page_from_l1e(pl1e[i], d);
 
@@ -2060,6 +2068,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
             rc = -EBUSY;
         }
     }
+    else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
                                      preserve_ad)) )
     {
@@ -2123,6 +2133,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
             rc = -EBUSY;
         }
     }
+    else if ( pv_l1tf_check_l2e(d, nl2e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
                                      preserve_ad)) )
     {
@@ -2184,6 +2196,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
             rc = -EFAULT;
         }
     }
+    else if ( pv_l1tf_check_l3e(d, nl3e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
                                      preserve_ad)) )
     {
@@ -2249,6 +2263,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
             rc = -EFAULT;
         }
     }
+    else if ( pv_l1tf_check_l4e(d, nl4e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
                                      preserve_ad)) )
     {
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index aa8d5a7556..a3c0c2dd19 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -29,6 +29,7 @@
 #include <asm/mm.h>
 #include <asm/pci.h>
 #include <asm/pv/mm.h>
+#include <asm/shadow.h>
 
 #include "emulate.h"
 #include "mm.h"
@@ -129,6 +130,10 @@ static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
 
     /* Check the new PTE. */
     nl1e = l1e_from_intpte(val);
+
+    if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
+        return X86EMUL_RETRY;
+
     switch ( ret = get_page_from_l1e(nl1e, d, d) )
     {
     default:
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 14afb7db52..f40f411871 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -124,8 +124,102 @@ static inline int shadow_domctl(struct domain *d,
  * What we can do is force a PV guest which writes a vulnerable PTE into
  * shadow mode, so Xen controls the pagetables which are reachable by the CPU
  * pagewalk.
+ *
+ * The core of the L1TF vulnerability is that the address bits of the PTE
+ * (accounting for PSE and factoring in the level-relevant part of the linear
+ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
+ * eventual memory address) before the Present or reserved bits (which would
+ * cause a terminal fault) are accounted for.  If an L1D hit occurs, the
+ * resulting data is available for potentially dependent instructions.
+ *
+ * For Present PTEs, the PV type-count safety logic ensures that the address
+ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
+ * Xen's point of view.  In practice, a PV guest should be unable to set any
+ * reserved bits, so should be unable to create any present L1TF-vulnerable
+ * PTEs at all.
+ *
+ * Therefore, these safety checks apply to Not-Present PTEs only, where
+ * traditionally, Xen would have let the guest write any value it chose.
+ *
+ * The all-zero PTE potentially leaks mfn 0.  All software on the system is
+ * expected to cooperate and not put any secrets there.  In a Xen system,
+ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
+ * the real mode IVT and Bios Data Area.  Therefore, mfn 0 is considered safe.
+ *
+ * Any PTE whose address is higher than the maximum cacheable address is safe,
+ * as it won't get an L1D hit.
+ *
+ * Speculative superpages also need accounting for, as PSE is considered
+ * irrespective of Present.  We disallow PSE being set, as it allows an
+ * attacker to leak 2M or 1G of data starting from mfn 0.  Also, because of
+ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
+ * will interpret an L4e as an L3e during a recursive walk.
  */
 
+static inline bool is_l1tf_safe_maddr(intpte_t pte)
+{
+    paddr_t maddr = pte & l1tf_addr_mask;
+
+    return maddr == 0 || maddr >= l1tf_safe_maddr;
+}
+
+static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
+                                     intpte_t pte)
+{
+    ASSERT(is_pv_domain(d));
+    ASSERT(!(pte & _PAGE_PRESENT));
+
+    if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
+         (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
+    {
+#ifdef CONFIG_SHADOW_PAGING
+        struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
+
+        printk(XENLOG_G_WARNING
+               "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
+               d->domain_id, level, pte);
+        /*
+         * Safety consideration for accessing tasklet.scheduled_on without the
+         * tasklet lock.  This is a singleshot tasklet with the side effect of
+         * setting PG_SH_forced (checked just above).  Multiple vcpus can race
+         * to schedule the tasklet, but if we observe it scheduled anywhere,
+         * that is good enough.
+         */
+        smp_rmb();
+        if ( !tasklet_is_scheduled(t) )
+            tasklet_schedule(t);
+#else
+        printk(XENLOG_G_ERR
+               "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
+               d->domain_id, level, pte);
+        domain_crash(d);
+#endif
+        return true;
+    }
+
+    return false;
+}
+
+static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
+{
+    return pv_l1tf_check_pte(d, 1, l1e.l1);
+}
+
+static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
+{
+    return pv_l1tf_check_pte(d, 2, l2e.l2);
+}
+
+static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
+{
+    return pv_l1tf_check_pte(d, 3, l3e.l3);
+}
+
+static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
+{
+    return pv_l1tf_check_pte(d, 4, l4e.l4);
+}
+
 void pv_l1tf_tasklet(unsigned long data);
 
 static inline void pv_l1tf_domain_init(struct domain *d)
diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
index 23d69c738e..bc9ddace6d 100644
--- a/xen/include/xen/tasklet.h
+++ b/xen/include/xen/tasklet.h
@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
                                                 TASKLET_scheduled);
 }
 
+static inline bool tasklet_is_scheduled(const struct tasklet *t)
+{
+    return t->scheduled_on != -1;
+}
+
 void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
 void tasklet_schedule(struct tasklet *t);
 void do_tasklet(void);
-- 
2.17.1


From fb78137bb82d3d8bcac36430b8bc331008ee3826 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Wed, 28 Mar 2018 15:21:39 +0100
Subject: [PATCH 38/42] x86/spec-ctrl: CPUID/MSR definitions for L1D_FLUSH

This is part of XSA-273 / CVE-2018-3646.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 3563fc2b2731a63fd7e8372ab0f5cef205bf8477)
---
 docs/misc/xen-command-line.markdown         | 8 ++++----
 tools/libxl/libxl_cpuid.c                   | 1 +
 tools/misc/xen-cpuid.c                      | 2 +-
 xen/arch/x86/cpuid.c                        | 5 +++++
 xen/arch/x86/spec_ctrl.c                    | 4 +++-
 xen/include/asm-x86/msr-index.h             | 4 ++++
 xen/include/public/arch-x86/cpufeatureset.h | 1 +
 7 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 763cc1d878..158b5bb919 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -489,10 +489,10 @@ accounting for hardware capabilities as enumerated via CPUID.
 
 Currently accepted:
 
-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, `ssbd` are
-used by default if available and applicable.  They can be ignored,
-e.g. `no-ibrsb`, at which point Xen won't use them itself, and won't offer
-them to guests.
+The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`,
+`l1d-flush` and `ssbd` are used by default if available and applicable.  They can
+be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and
+won't offer them to guests.
 
 ### cpuid\_mask\_cpu (AMD only)
 > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
index 7b0f594c3d..52e16c20ed 100644
--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -204,6 +204,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
         {"avx512-4fmaps",0x00000007,  0, CPUID_REG_EDX,  3,  1},
         {"ibrsb",        0x00000007,  0, CPUID_REG_EDX, 26,  1},
         {"stibp",        0x00000007,  0, CPUID_REG_EDX, 27,  1},
+        {"l1d-flush",    0x00000007,  0, CPUID_REG_EDX, 28,  1},
         {"arch-caps",    0x00000007,  0, CPUID_REG_EDX, 29,  1},
         {"ssbd",         0x00000007,  0, CPUID_REG_EDX, 31,  1},
 
diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
index e116339733..3888b4e158 100644
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -143,7 +143,7 @@ static const char *str_7d0[32] =
     [ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps",
 
     [26] = "ibrsb",         [27] = "stibp",
-    /* 28 */                [29] = "arch_caps",
+    [28] = "l1d_flush",     [29] = "arch_caps",
     /* 30 */                [31] = "ssbd",
 };
 
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index beee47d0ed..5cc89e2b34 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -43,6 +43,11 @@ static int __init parse_xen_cpuid(const char *s)
             if ( !val )
                 setup_clear_cpu_cap(X86_FEATURE_STIBP);
         }
+        else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
+        {
+            if ( !val )
+                setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH);
+        }
         else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
         {
             if ( !val )
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 7995e27218..9bcc2b6adc 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -266,14 +266,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
     printk("Speculative mitigation facilities:\n");
 
     /* Hardware features which pertain to speculative mitigations. */
-    printk("  Hardware features:%s%s%s%s%s%s%s%s\n",
+    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP"     : "",
+           (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_SSBD))  ? " SSBD"      : "",
            (e8b  & cpufeat_mask(X86_FEATURE_IBPB))  ? " IBPB"      : "",
            (caps & ARCH_CAPABILITIES_IBRS_ALL)      ? " IBRS_ALL"  : "",
            (caps & ARCH_CAPABILITIES_RDCL_NO)       ? " RDCL_NO"   : "",
            (caps & ARCH_CAPS_RSBA)                  ? " RSBA"      : "",
+           (caps & ARCH_CAPS_SKIP_L1DFL)            ? " SKIP_L1DFL": "",
            (caps & ARCH_CAPS_SSB_NO)                ? " SSB_NO"    : "");
 
     /* Compiled-in support which pertains to mitigations. */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 8fbccc88a7..7235623c86 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -47,8 +47,12 @@
 #define ARCH_CAPABILITIES_RDCL_NO	(_AC(1, ULL) << 0)
 #define ARCH_CAPABILITIES_IBRS_ALL	(_AC(1, ULL) << 1)
 #define ARCH_CAPS_RSBA			(_AC(1, ULL) << 2)
+#define ARCH_CAPS_SKIP_L1DFL		(_AC(1, ULL) << 3)
 #define ARCH_CAPS_SSB_NO		(_AC(1, ULL) << 4)
 
+#define MSR_FLUSH_CMD			0x0000010b
+#define FLUSH_CMD_L1D			(_AC(1, ULL) << 0)
+
 /* Intel MSRs. Some also available on other CPUs */
 #define MSR_IA32_PERFCTR0		0x000000c1
 #define MSR_IA32_A_PERFCTR0		0x000004c1
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index f1a5ed93e0..9f4c8246a9 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -244,6 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A  AVX512 Neural Network Instructions *
 XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A  AVX512 Multiply Accumulation Single Precision */
 XEN_CPUFEATURE(IBRSB,         9*32+26) /*A  IBRS and IBPB support (used by Intel) */
 XEN_CPUFEATURE(STIBP,         9*32+27) /*A  STIBP */
+XEN_CPUFEATURE(L1D_FLUSH,     9*32+28) /*   MSR_FLUSH_CMD and L1D flush. */
 XEN_CPUFEATURE(ARCH_CAPS,     9*32+29) /*   IA32_ARCH_CAPABILITIES MSR */
 XEN_CPUFEATURE(SSBD,          9*32+31) /*A  MSR_SPEC_CTRL.SSBD available */
 
-- 
2.17.1


From 007752fb9b85b9235fe2820677988c6408c583da Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Fri, 13 Apr 2018 15:34:01 +0000
Subject: [PATCH 39/42] x86/msr: Virtualise MSR_FLUSH_CMD for guests

Guests (outside of the nested virt case, which isn't supported yet) don't need
L1D_FLUSH for their L1TF mitigations, but offering/emulating MSR_FLUSH_CMD is
easy and doesn't pose an issue for Xen.

The MSR is offered to HVM guests only.  PV guests attempting to use it would
trap for emulation, and the L1D cache would fill long before the return to
guest context.  As such, PV guests can't make any use of the L1D_FLUSH
functionality.
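
From the guest's point of view the interface is trivial.  An illustrative
kernel-side sketch, using the architectural MSR index and bit from the hunks
below (cpu_has_l1d_flush is a stand-in for a CPUID.7.0:EDX[28] check):

    #define MSR_FLUSH_CMD   0x0000010b
    #define FLUSH_CMD_L1D   (1ULL << 0)     /* write-only command bit */

    /* Flush the L1 data cache, e.g. before entering a less trusted context. */
    static inline void l1d_flush(void)
    {
        if ( cpu_has_l1d_flush )
            wrmsrl(MSR_FLUSH_CMD, FLUSH_CMD_L1D);
    }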

This is part of XSA-273 / CVE-2018-3646.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit fd9823faf9df057a69a9a53c2e100691d3f4267c)
---
 xen/arch/x86/domctl.c                       |  3 ++-
 xen/arch/x86/hvm/vmx/vmx.c                  |  6 ++++++
 xen/arch/x86/msr.c                          | 12 ++++++++++++
 xen/include/public/arch-x86/cpufeatureset.h |  2 +-
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index fa82b6744e..dd91038a67 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -225,7 +225,8 @@ static int update_domain_cpuid_info(struct domain *d,
          */
         call_policy_changed = (is_hvm_domain(d) &&
                                ((old_7d0 ^ p->feat.raw[0].d) &
-                                cpufeat_mask(X86_FEATURE_IBRSB)));
+                                (cpufeat_mask(X86_FEATURE_IBRSB) |
+                                 cpufeat_mask(X86_FEATURE_L1D_FLUSH))));
         break;
 
     case 0xa:
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c7cf3a8fbc..b0fababede 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -583,6 +583,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v)
         vmx_clear_msr_intercept(v, MSR_PRED_CMD,  VMX_MSR_RW);
     else
         vmx_set_msr_intercept(v, MSR_PRED_CMD,  VMX_MSR_RW);
+
+    /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. */
+    if ( cp->feat.l1d_flush )
+        vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+    else
+        vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
 }
 
 int vmx_guest_x86_mode(struct vcpu *v)
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 1e12ccb729..1a591dd2b5 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -150,6 +150,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
     case MSR_AMD_PATCHLOADER:
     case MSR_IA32_UCODE_WRITE:
     case MSR_PRED_CMD:
+    case MSR_FLUSH_CMD:
         /* Write-only */
         goto gp_fault;
 
@@ -254,6 +255,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
             wrmsrl(MSR_PRED_CMD, val);
         break;
 
+    case MSR_FLUSH_CMD:
+        if ( !cp->feat.l1d_flush )
+            goto gp_fault; /* MSR available? */
+
+        if ( val & ~FLUSH_CMD_L1D )
+            goto gp_fault; /* Rsvd bit set? */
+
+        if ( v == curr )
+            wrmsrl(MSR_FLUSH_CMD, val);
+        break;
+
     case MSR_INTEL_MISC_FEATURES_ENABLES:
     {
         bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting;
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index 9f4c8246a9..6c82816fd3 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -244,7 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A  AVX512 Neural Network Instructions *
 XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A  AVX512 Multiply Accumulation Single Precision */
 XEN_CPUFEATURE(IBRSB,         9*32+26) /*A  IBRS and IBPB support (used by Intel) */
 XEN_CPUFEATURE(STIBP,         9*32+27) /*A  STIBP */
-XEN_CPUFEATURE(L1D_FLUSH,     9*32+28) /*   MSR_FLUSH_CMD and L1D flush. */
+XEN_CPUFEATURE(L1D_FLUSH,     9*32+28) /*S  MSR_FLUSH_CMD and L1D flush. */
 XEN_CPUFEATURE(ARCH_CAPS,     9*32+29) /*   IA32_ARCH_CAPABILITIES MSR */
 XEN_CPUFEATURE(SSBD,          9*32+31) /*A  MSR_SPEC_CTRL.SSBD available */
 
-- 
2.17.1


From 2a47c7550910f5d591ca0de369234f8c18daa2d2 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Tue, 29 May 2018 18:44:16 +0100
Subject: [PATCH 40/42] x86/spec-ctrl: Introduce an option to control L1D_FLUSH
 for HVM HAP guests

This mitigation requires up-to-date microcode.  It is enabled by default on
affected hardware if available, and is used for HVM guests.

The default for SMT/Hyperthreading is far more complicated to reason about,
not least because we don't know if the user is going to want to run any HVM
guests to begin with.  If an explicit default isn't given, nag the user to
perform a risk assessment and choose an explicit default, and leave other
configuration to the toolstack.
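
An illustrative command line combining the two controls (the values are a
policy choice for the administrator, not a recommendation):

    spec-ctrl=l1d-flush smt=0

This forces the VMEntry L1D flush on and explicitly opts out of using SMT
siblings, which also satisfies the nag described above.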

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 3bd36952dab60290f33d6791070b57920e10754b)
---
 docs/misc/xen-command-line.markdown |  9 ++++++-
 xen/arch/x86/hvm/vmx/vmcs.c         |  5 ++++
 xen/arch/x86/spec_ctrl.c            | 38 +++++++++++++++++++++++++++--
 xen/include/asm-x86/spec_ctrl.h     |  1 +
 4 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 158b5bb919..57ef18194a 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1791,7 +1791,8 @@ false disable the quirk workaround, which is also the default.
 
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>,
->              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu}=<bool> ]`
+>              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
+>              l1d-flush}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
 will pick the most appropriate mitigations based on compiled in support,
@@ -1846,6 +1847,12 @@ from using fully eager FPU context switches.  This is currently implemented as
 a global control.  By default, Xen will choose to use fully eager context
 switches on hardware believed to speculate past #NM exceptions.
 
+On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force
+or prevent Xen from issuing an L1 data cache flush on each VMEntry.
+Irrespective of Xen's setting, the feature is virtualised for HVM guests to
+use.  By default, Xen will enable this mitigation on hardware believed to be
+vulnerable to L1TF.
+
 ### sync\_console
 > `= <boolean>`
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 30a33dd0bd..2ba0c40808 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -38,6 +38,7 @@
 #include <asm/flushtlb.h>
 #include <asm/monitor.h>
 #include <asm/shadow.h>
+#include <asm/spec_ctrl.h>
 #include <asm/tboot.h>
 #include <asm/apic.h>
 
@@ -1274,6 +1275,10 @@ static int construct_vmcs(struct vcpu *v)
 
     vmx_vlapic_msr_changed(v);
 
+    if ( opt_l1d_flush && paging_mode_hap(d) )
+        rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
+                         VMX_MSR_GUEST_LOADONLY);
+
  out:
     vmx_vmcs_exit(v);
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 9bcc2b6adc..59baebb959 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -19,11 +19,13 @@
 #include <xen/errno.h>
 #include <xen/init.h>
 #include <xen/lib.h>
+#include <xen/warning.h>
 
 #include <asm/microcode.h>
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/pv/shim.h>
+#include <asm/setup.h>
 #include <asm/spec_ctrl.h>
 #include <asm/spec_ctrl_asm.h>
 
@@ -46,6 +48,7 @@ static int8_t __initdata opt_ibrs = -1;
 bool __read_mostly opt_ibpb = true;
 bool __read_mostly opt_ssbd = false;
 int8_t __read_mostly opt_eager_fpu = -1;
+int8_t __read_mostly opt_l1d_flush = -1;
 
 bool __initdata bsp_delay_spec_ctrl;
 uint8_t __read_mostly default_xen_spec_ctrl;
@@ -139,6 +142,7 @@ static int __init parse_spec_ctrl(const char *s)
             opt_ibrs = 0;
             opt_ibpb = false;
             opt_ssbd = false;
+            opt_l1d_flush = 0;
         }
         else if ( val > 0 )
             rc = -EINVAL;
@@ -194,6 +198,8 @@ static int __init parse_spec_ctrl(const char *s)
             opt_ssbd = val;
         else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
             opt_eager_fpu = val;
+        else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
+            opt_l1d_flush = val;
         else
             rc = -EINVAL;
 
@@ -290,7 +296,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
                "\n");
 
     /* Settings for Xen's protection, irrespective of guests. */
-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
+    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n",
            thunk == THUNK_NONE      ? "N/A" :
            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
            thunk == THUNK_LFENCE    ? "LFENCE" :
@@ -299,7 +305,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (default_xen_spec_ctrl & SPEC_CTRL_IBRS)  ? "IBRS+" :  "IBRS-",
            !boot_cpu_has(X86_FEATURE_SSBD)           ? "" :
            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
-           opt_ibpb                                  ? " IBPB"  : "");
+           opt_ibpb                                  ? " IBPB"  : "",
+           opt_l1d_flush                             ? " L1D_FLUSH" : "");
 
     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
     if ( cpu_has_bug_l1tf || opt_pv_l1tf )
@@ -871,6 +878,33 @@ void __init init_speculation_mitigations(void)
             opt_pv_l1tf = OPT_PV_L1TF_DOMU;
     }
 
+    /*
+     * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
+     * instructed to skip the flush on vmentry by our outer hypervisor.
+     */
+    if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) )
+        opt_l1d_flush = 0;
+    else if ( opt_l1d_flush == -1 )
+        opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL);
+
+    /*
+     * We do not disable HT by default on affected hardware.
+     *
+     * Firstly, if the user intends to use exclusively PV, or HVM shadow
+     * guests, HT isn't a concern and should remain fully enabled.  Secondly,
+     * safety for HVM HAP guests can be arranged by the toolstack with core
+     * parking, pinning or cpupool configurations, including mixed setups.
+     *
+     * However, if we are on affected hardware, with HT enabled, and the user
+     * hasn't explicitly chosen whether to use HT or not, nag them to do so.
+     */
+    if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim &&
+         boot_cpu_data.x86_num_siblings > 1 )
+        warning_add(
+            "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n"
+            "enabled.  Please assess your configuration and choose an\n"
+            "explicit 'smt=<bool>' setting.  See XSA-273.\n");
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index cdf5737dc2..8f8aad40bb 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -29,6 +29,7 @@ void init_speculation_mitigations(void);
 extern bool opt_ibpb;
 extern bool opt_ssbd;
 extern int8_t opt_eager_fpu;
+extern int8_t opt_l1d_flush;
 
 extern bool bsp_delay_spec_ctrl;
 extern uint8_t default_xen_spec_ctrl;
-- 
2.17.1


From 6c7d074a4b5c8e69e21e505a04e7bb3f43658bea Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 13 Aug 2018 05:07:23 -0600
Subject: [PATCH 41/42] x86: Make "spec-ctrl=no" a global disable of all
 mitigations

In order to have a simple and easy to remember means to suppress all the
more or less recent workarounds for hardware vulnerabilities, force
settings not controlled by "spec-ctrl=" also to their original defaults,
unless they've been forced to specific values already by earlier command
line options.
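
For instance (illustrative command lines):

    spec-ctrl=no
        (now also forces xpti=0, smt=1 and pv-l1tf=0)
    xpti spec-ctrl=no
        (XPTI remains enabled, having been forced explicitly beforehand)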

This is part of XSA-273.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
(cherry picked from commit d8800a82c3840b06b17672eddee4878bbfdacc6d)
---
 docs/misc/xen-command-line.markdown | 13 +++++++++----
 xen/arch/x86/spec_ctrl.c            |  9 +++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 57ef18194a..0886706368 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1804,10 +1804,15 @@ extreme care.**
 
 An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
 mitigations, including pieces of infrastructure used to virtualise certain
-mitigation features for guests.  Alternatively, a slightly more restricted
-`spec-ctrl=no-xen` can be used to turn off all of Xen's mitigations, while
-leaving the virtualisation support in place for guests to use.  Use of a
-positive boolean value for either of these options is invalid.
+mitigation features for guests.  This also includes settings which `xpti`,
+`smt`, `pv-l1tf` control, unless the respective option(s) have been
+specified earlier on the command line.
+
+Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
+turn off all of Xen's mitigations, while leaving the virtualisation support
+in place for guests to use.
+
+Use of a positive boolean value for either of these options is invalid.
 
 The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control
 over the alternative blocks used by Xen.  These impact Xen's ability to
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 59baebb959..f0c50d6703 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -134,6 +134,15 @@ static int __init parse_spec_ctrl(const char *s)
 
             opt_eager_fpu = 0;
 
+            if ( opt_xpti < 0 )
+                opt_xpti = 0;
+
+            if ( opt_smt < 0 )
+                opt_smt = 1;
+
+            if ( opt_pv_l1tf < 0 )
+                opt_pv_l1tf = 0;
+
         disable_common:
             opt_rsb_pv = false;
             opt_rsb_hvm = false;
-- 
2.17.1


From d757c29ffe2e31b15397e43cd58da88b6318b654 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 7 Aug 2018 15:35:34 +0100
Subject: [PATCH 42/42] xl.conf: Add global affinity masks

XSA-273 involves one hyperthread being able to use Spectre-like
techniques to "spy" on another thread.  The details are somewhat
complicated, but the upshot is that after all Xen-based mitigations
have been applied:

* PV guests cannot spy on sibling threads
* HVM guests can spy on sibling threads

(NB that for purposes of this vulnerability, PVH and HVM guests are
identical.  Whenever this comment refers to 'HVM', this includes PVH.)

There are many possible mitigations to this, including disabling
hyperthreading entirely.  But another solution would be:

* Specify some cores as PV-only, others as PV or HVM
* Allow HVM guests to only run on thread 0 of the "HVM-or-PV" cores
* Allow PV guests to run on the above cores, as well as any thread of the PV-only cores.

For example, suppose you had 16 threads across 8 cores (0-7).  You
could specify 0-3 as PV-only, and 4-7 as HVM-or-PV.  Then you'd set
the affinity of the HVM guests as follows (binary representation):

0000000010101010

And the affinity of the PV guests as follows:

1111111110101010
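
(Reading the bit strings above with CPU 0 leftmost, and assuming threads 2c
and 2c+1 belong to core c, the HVM mask is CPUs 8,10,12,14 and the PV mask
is CPUs 0-8,10,12,14.)  A standalone sketch verifying the arithmetic:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t hvm = 0, pv;
        unsigned int core;

        /* Thread 0 of each HVM-or-PV core (cores 4-7). */
        for ( core = 4; core < 8; core++ )
            hvm |= 1u << (2 * core);

        /* PV guests additionally get every thread of cores 0-3. */
        pv = hvm | 0x00ff;

        assert(hvm == 0x5500);  /* CPUs 8,10,12,14 */
        assert(pv == 0x55ff);   /* CPUs 0-8,10,12,14 */
        return 0;
    }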

In order to make this easy, this patch introduces three "global affinity
masks", placed in xl.conf:

    vm.cpumask
    vm.hvm.cpumask
    vm.pv.cpumask

These are parsed just like the 'cpus' and 'cpus_soft' options in the
per-domain xl configuration files.  The resulting mask is AND-ed with
whatever mask results at the end of the xl configuration file.
`vm.cpumask` would be applied to all guest types, `vm.hvm.cpumask`
would be applied to HVM and PVH guest types, and `vm.pv.cpumask`
would be applied to PV guest types.

The idea would be that to implement the above mask across all your
VMs, you'd simply add the following two lines to the configuration
file:

    vm.hvm.cpumask=8,10,12,14
    vm.pv.cpumask=0-8,10,12,14

See xl.conf manpage for details.

This is part of XSA-273 / CVE-2018-3646.

Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
(cherry picked from commit aa67b97ed34279c43a43d9ca46727b5746caa92e)
---
 docs/man/xl.conf.pod.5  | 22 ++++++++++++
 tools/examples/xl.conf  |  5 +++
 tools/xl/xl.c           | 26 ++++++++++++++
 tools/xl/xl.h           |  7 ++++
 tools/xl/xl_cmdtable.c  |  6 ++--
 tools/xl/xl_vcpu.c      | 80 +++++++++++++++++++++++++++++++++++++++--
 tools/xl/xl_vmcontrol.c | 39 ++++++++++++++++++--
 7 files changed, 179 insertions(+), 6 deletions(-)

diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5
index da91b8626c..37262a7ef8 100644
--- a/docs/man/xl.conf.pod.5
+++ b/docs/man/xl.conf.pod.5
@@ -185,6 +185,28 @@ massively huge guests).
 
 =back
 
+=item B<vm.cpumask>="CPULIST"
+
+=item B<vm.hvm.cpumask>="CPULIST"
+
+=item B<vm.pv.cpumask>="CPULIST"
+
+Global masks that are applied when creating guests and pinning vcpus
+to indicate which cpus they are allowed to run on.  Specifically,
+C<vm.cpumask> applies to all guest types, C<vm.hvm.cpumask> applies to
+both HVM and PVH guests and C<vm.pv.cpumask> applies to PV guests.
+
+The hard affinity of a guest's vcpus is logical-AND'ed with the respective
+masks.  If the resulting affinity mask is empty, the operation will fail.
+
+Use --ignore-global-affinity-masks to skip applying global masks.
+
+The default value for these masks is all 1's, i.e. all cpus are allowed.
+
+Due to bug(s), these options may not interact well with other options
+concerning CPU affinity. One example is CPU pools. Users should always double
+check that the required affinity has taken effect.
+
 =back
 
 =head1 SEE ALSO
diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf
index 374b6bbc2e..0446deb304 100644
--- a/tools/examples/xl.conf
+++ b/tools/examples/xl.conf
@@ -37,3 +37,8 @@
 # (which can take a long time to find out if launching huge guests).
 # see xl.conf(5) for details.
 #claim_mode=1
+
+# Specify global vcpu hard affinity masks. See xl.conf(5) for details.
+#vm.cpumask="0-7"
+#vm.pv.cpumask="0-3"
+#vm.hvm.cpumask="3-7"
diff --git a/tools/xl/xl.c b/tools/xl/xl.c
index 179908b4f6..7d2142f16f 100644
--- a/tools/xl/xl.c
+++ b/tools/xl/xl.c
@@ -28,6 +28,9 @@
 #include <libxl_utils.h>
 #include <libxlutil.h>
 #include "xl.h"
+#include "xl_parse.h"
+
+#include "xl_utils.h"
 
 xentoollog_logger_stdiostream *logger;
 int dryrun_only;
@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL;
 char *default_vifbackend = NULL;
 char *default_remus_netbufscript = NULL;
 char *default_colo_proxy_script = NULL;
+libxl_bitmap global_vm_affinity_mask;
+libxl_bitmap global_hvm_affinity_mask;
+libxl_bitmap global_pv_affinity_mask;
 enum output_format default_output_format = OUTPUT_FORMAT_JSON;
 int claim_mode = 1;
 bool progress_use_cr = 0;
@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile,
     if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0))
         max_maptrack_frames = l;
 
+    libxl_bitmap_init(&global_vm_affinity_mask);
+    libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0);
+    libxl_bitmap_init(&global_hvm_affinity_mask);
+    libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0);
+    libxl_bitmap_init(&global_pv_affinity_mask);
+    libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0);
+
+    if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0))
+        parse_cpurange(buf, &global_vm_affinity_mask);
+    else
+        libxl_bitmap_set_any(&global_vm_affinity_mask);
+    if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0))
+        parse_cpurange(buf, &global_hvm_affinity_mask);
+    else
+        libxl_bitmap_set_any(&global_hvm_affinity_mask);
+    if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0))
+        parse_cpurange(buf, &global_pv_affinity_mask);
+    else
+        libxl_bitmap_set_any(&global_pv_affinity_mask);
+
     xlu_cfg_destroy(config);
 }
 
diff --git a/tools/xl/xl.h b/tools/xl/xl.h
index 4e784ff402..7e97144b50 100644
--- a/tools/xl/xl.h
+++ b/tools/xl/xl.h
@@ -41,6 +41,7 @@ struct domain_create {
     int vncautopass;
     int console_autoconnect;
     int checkpointed_stream;
+    int ignore_global_affinity_masks;
     const char *config_file;
     char *extra_config; /* extra config string */
     const char *restore_file;
@@ -279,6 +280,9 @@ extern char *default_colo_proxy_script;
 extern char *blkdev_start;
 extern int max_grant_frames;
 extern int max_maptrack_frames;
+extern libxl_bitmap global_vm_affinity_mask;
+extern libxl_bitmap global_hvm_affinity_mask;
+extern libxl_bitmap global_pv_affinity_mask;
 
 enum output_format {
     OUTPUT_FORMAT_JSON,
@@ -294,6 +298,9 @@ typedef enum {
 } domain_restart_type;
 
 extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh);
+extern void apply_global_affinity_masks(libxl_domain_type type,
+                                        libxl_bitmap *vcpu_affinity_array,
+                                        unsigned int size);
 
 #define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf"
 #define XL_LOCK_FILE XEN_LOCK_DIR "/xl"
diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
index bf2ced8140..54c2db6022 100644
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = {
       "-e                      Do not wait in the background for the death of the domain.\n"
       "-V, --vncviewer         Connect to the VNC display after the domain is created.\n"
       "-A, --vncviewer-autopass\n"
-      "                        Pass VNC password to viewer via stdin."
+      "                        Pass VNC password to viewer via stdin.\n"
+      "--ignore-global-affinity-masks Ignore global masks in xl.conf."
     },
     { "config-update",
       &main_config_update, 1, 1,
@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = {
       &main_vcpupin, 1, 1,
       "Set which CPUs a VCPU can use",
       "[option] <Domain> <VCPU|all> <Hard affinity|-|all> <Soft affinity|-|all>",
-      "-f, --force        undo an override pinning done by the kernel",
+      "-f, --force        undo an override pinning done by the kernel\n"
+      "--ignore-global-affinity-masks Ignore global masks in xl.conf",
     },
     { "vcpu-set",
       &main_vcpuset, 0, 1,
diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c
index 8e735b38c1..3384eeed06 100644
--- a/tools/xl/xl_vcpu.c
+++ b/tools/xl/xl_vcpu.c
@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
     libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu);
 }
 
+void apply_global_affinity_masks(libxl_domain_type type,
+                                 libxl_bitmap *vcpu_affinity_array,
+                                 unsigned int size)
+{
+    libxl_bitmap *mask = &global_vm_affinity_mask;
+    libxl_bitmap *type_mask;
+    unsigned int i;
+
+    switch (type) {
+    case LIBXL_DOMAIN_TYPE_HVM:
+    case LIBXL_DOMAIN_TYPE_PVH:
+        type_mask = &global_hvm_affinity_mask;
+        break;
+    case LIBXL_DOMAIN_TYPE_PV:
+        type_mask = &global_pv_affinity_mask;
+        break;
+    default:
+        fprintf(stderr, "Unknown guest type\n");
+        exit(EXIT_FAILURE);
+    }
+
+    for (i = 0; i < size; i++) {
+        int rc;
+        libxl_bitmap *t = &vcpu_affinity_array[i];
+        libxl_bitmap b1, b2;
+
+        libxl_bitmap_init(&b1);
+        libxl_bitmap_init(&b2);
+
+        rc = libxl_bitmap_and(ctx, &b1, t, mask);
+        if (rc) {
+            fprintf(stderr, "libxl_bitmap_and errored\n");
+            exit(EXIT_FAILURE);
+        }
+        rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask);
+        if (rc) {
+            fprintf(stderr, "libxl_bitmap_and errored\n");
+            exit(EXIT_FAILURE);
+        }
+
+        if (libxl_bitmap_is_empty(&b2)) {
+            fprintf(stderr, "vcpu hard affinity map is empty\n");
+            exit(EXIT_FAILURE);
+        }
+
+        /* Replace target bitmap with the result */
+        libxl_bitmap_dispose(t);
+        libxl_bitmap_init(t);
+        libxl_bitmap_copy_alloc(ctx, t, &b2);
+
+        libxl_bitmap_dispose(&b1);
+        libxl_bitmap_dispose(&b2);
+    }
+}
+
 static void vcpulist(int argc, char **argv)
 {
     libxl_dominfo *dominfo;
@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv)
 {
     static struct option opts[] = {
         {"force", 0, 0, 'f'},
+        {"ignore-global-affinity-masks", 0, 0, 'i'},
         COMMON_LONG_OPTS
     };
     libxl_vcpuinfo *vcpuinfo;
@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv)
     const char *vcpu, *hard_str, *soft_str;
     char *endptr;
     int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE;
-    bool force = false;
+    bool force = false, ignore_masks = false;
 
     libxl_bitmap_init(&cpumap_hard);
     libxl_bitmap_init(&cpumap_soft);
 
-    SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) {
+    SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) {
     case 'f':
         force = true;
         break;
+    case 'i':
+        ignore_masks = true;
+        break;
     default:
         break;
     }
@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv)
         goto out;
     }
 
+    /* Only hard affinity matters here */
+    if (!ignore_masks) {
+        libxl_domain_config d_config;
+
+        libxl_domain_config_init(&d_config);
+        rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config);
+        if (rc) {
+            fprintf(stderr, "Could not retrieve domain configuration\n");
+            libxl_domain_config_dispose(&d_config);
+            goto out;
+        }
+
+        apply_global_affinity_masks(d_config.b_info.type, hard, 1);
+
+        libxl_domain_config_dispose(&d_config);
+    }
+
     if (force) {
         if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) {
             fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n",
diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
index 89c2b25ded..a1d633795c 100644
--- a/tools/xl/xl_vmcontrol.c
+++ b/tools/xl/xl_vmcontrol.c
@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info)
         parse_config_data(config_source, config_data, config_len, &d_config);
     }
 
+    if (!dom_info->ignore_global_affinity_masks) {
+        libxl_domain_build_info *b_info = &d_config.b_info;
+
+        /* It is possible that no hard affinity is specified in config file.
+         * Generate hard affinity maps now if we care about those.
+         */
+        if (b_info->num_vcpu_hard_affinity == 0 &&
+              (!libxl_bitmap_is_full(&global_vm_affinity_mask) ||
+                 (b_info->type == LIBXL_DOMAIN_TYPE_PV &&
+                  !libxl_bitmap_is_full(&global_pv_affinity_mask)) ||
+                 (b_info->type != LIBXL_DOMAIN_TYPE_PV &&
+                  !libxl_bitmap_is_full(&global_hvm_affinity_mask))
+               )) {
+            b_info->num_vcpu_hard_affinity = b_info->max_vcpus;
+            b_info->vcpu_hard_affinity =
+                xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap));
+
+            for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) {
+                libxl_bitmap *m = &b_info->vcpu_hard_affinity[i];
+                libxl_bitmap_init(m);
+                libxl_cpu_bitmap_alloc(ctx, m, 0);
+                libxl_bitmap_set_any(m);
+            }
+        }
+
+        apply_global_affinity_masks(b_info->type,
+                                    b_info->vcpu_hard_affinity,
+                                    b_info->num_vcpu_hard_affinity);
+    }
+
     if (migrate_fd >= 0) {
         if (d_config.c_info.name) {
             /* when we receive a domain we get its name from the config
@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv)
     const char *filename = NULL;
     struct domain_create dom_info;
     int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
-        quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
+        quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0;
     int opt, rc;
     static struct option opts[] = {
         {"dryrun", 0, 0, 'n'},
@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv)
         {"defconfig", 1, 0, 'f'},
         {"vncviewer", 0, 0, 'V'},
         {"vncviewer-autopass", 0, 0, 'A'},
+        {"ignore-global-affinity-masks", 0, 0, 'i'},
         COMMON_LONG_OPTS
     };
 
@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv)
         argc--; argv++;
     }
 
-    SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) {
+    SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVAi", opts, "create", 0) {
     case 'f':
         filename = optarg;
         break;
@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv)
     case 'A':
         vnc = vncautopass = 1;
         break;
+    case 'i':
+        ignore_masks = 1;
+        break;
     }
 
     memset(&dom_info, 0, sizeof(dom_info));
@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv)
     dom_info.vnc = vnc;
     dom_info.vncautopass = vncautopass;
     dom_info.console_autoconnect = console_autoconnect;
+    dom_info.ignore_global_affinity_masks = ignore_masks;
 
     rc = create_domain(&dom_info);
     if (rc < 0) {
-- 
2.17.1