|
|
b34d678 |
From 94f984ff563d1777652b822d7a282cacc1e481c2 Mon Sep 17 00:00:00 2001
|
|
|
b34d678 |
From: Andrea Arcangeli <aarcange@redhat.com>
|
|
|
b34d678 |
Date: Wed, 27 Apr 2016 12:04:46 -0500
|
|
|
b34d678 |
Subject: [PATCH] mm: thp: kvm: fix memory corruption in KVM with THP enabled
|
|
|
b34d678 |
|
|
|
b34d678 |
After the THP refcounting change, obtaining a compound page from
|
|
|
b34d678 |
get_user_pages() no longer allows us to assume the entire compound
|
|
|
b34d678 |
page is immediately mappable from a secondary MMU.
|
|
|
b34d678 |
|
|
|
b34d678 |
A secondary MMU doesn't want to call get_user_pages() more than once
|
|
|
b34d678 |
for each compound page, in order to know if it can map the whole
|
|
|
b34d678 |
compound page. So a secondary MMU needs to know from a single
|
|
|
b34d678 |
get_user_pages() invocation when it can immediately map the entire
|
|
|
b34d678 |
compound page to avoid a flood of unnecessary secondary MMU faults and
|
|
|
b34d678 |
spurious atomic_inc()/atomic_dec() (pages don't have to be pinned by
|
|
|
b34d678 |
MMU notifier users).
|
|
|
b34d678 |
|
|
|
b34d678 |
Ideally instead of the page->_mapcount < 0 check, get_user_pages()
|
|
|
b34d678 |
should return the granularity of the "page" mapping in the "mm" passed
|
|
|
b34d678 |
to get_user_pages(). However it's a non-trivial change to pass the "pmd"
|
|
|
b34d678 |
status belonging to the "mm" walked by get_user_pages up the stack (up
|
|
|
b34d678 |
to the caller of get_user_pages). So the fix just checks if there is
|
|
|
b34d678 |
not a single pte mapping on the page returned by get_user_pages, and
|
|
|
b34d678 |
in turn if the caller can assume that the whole compound page is
|
|
|
b34d678 |
mapped in the current "mm" (in a pmd_trans_huge()). In such case the
|
|
|
b34d678 |
entire compound page is safe to map into the secondary MMU without
|
|
|
b34d678 |
additional get_user_pages() calls on the surrounding tail/head
|
|
|
b34d678 |
pages. In addition to being faster, not having to run other
|
|
|
b34d678 |
get_user_pages() calls also reduces the memory footprint of the
|
|
|
b34d678 |
secondary MMU fault in case the pmd split happened as a result of memory
|
|
|
b34d678 |
pressure.
|
|
|
b34d678 |
|
|
|
b34d678 |
Without this fix after a MADV_DONTNEED (like invoked by QEMU during
|
|
|
b34d678 |
postcopy live migration or ballooning) or after generic swapping (with
|
|
|
b34d678 |
a failure in split_huge_page() that would only result in pmd splitting
|
|
|
b34d678 |
and not a physical page split), KVM would map the whole compound page
|
|
|
b34d678 |
into the shadow pagetables, even though regular faults or userfaults (like
|
|
|
b34d678 |
UFFDIO_COPY) may map regular pages into the primary MMU as a result of
|
|
|
b34d678 |
the pte faults, leading to the guest mode and userland mode going out
|
|
|
b34d678 |
of sync and not working on the same memory at all times.
|
|
|
b34d678 |
|
|
|
b34d678 |
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
|
|
|
b34d678 |
---
|
|
|
b34d678 |
arch/arm/kvm/mmu.c | 2 +-
|
|
|
b34d678 |
arch/x86/kvm/mmu.c | 4 ++--
|
|
|
b34d678 |
include/linux/page-flags.h | 22 ++++++++++++++++++++++
|
|
|
b34d678 |
3 files changed, 25 insertions(+), 3 deletions(-)
|
|
|
b34d678 |
|
|
|
b34d678 |
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
|
|
|
b34d678 |
index aba61fd..8dafe97 100644
|
|
|
b34d678 |
--- a/arch/arm/kvm/mmu.c
|
|
|
b34d678 |
+++ b/arch/arm/kvm/mmu.c
|
|
|
b34d678 |
@@ -997,7 +997,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
|
|
|
b34d678 |
kvm_pfn_t pfn = *pfnp;
|
|
|
b34d678 |
gfn_t gfn = *ipap >> PAGE_SHIFT;
|
|
|
b34d678 |
|
|
|
b34d678 |
- if (PageTransCompound(pfn_to_page(pfn))) {
|
|
|
b34d678 |
+ if (PageTransCompoundMap(pfn_to_page(pfn))) {
|
|
|
b34d678 |
unsigned long mask;
|
|
|
b34d678 |
/*
|
|
|
b34d678 |
* The address we faulted on is backed by a transparent huge
|
|
|
b34d678 |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
|
|
|
b34d678 |
index 1e7a49b..3a371f7 100644
|
|
|
b34d678 |
--- a/arch/x86/kvm/mmu.c
|
|
|
b34d678 |
+++ b/arch/x86/kvm/mmu.c
|
|
|
b34d678 |
@@ -2767,7 +2767,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
|
|
|
b34d678 |
*/
|
|
|
b34d678 |
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
|
|
|
b34d678 |
level == PT_PAGE_TABLE_LEVEL &&
|
|
|
b34d678 |
- PageTransCompound(pfn_to_page(pfn)) &&
|
|
|
b34d678 |
+ PageTransCompoundMap(pfn_to_page(pfn)) &&
|
|
|
b34d678 |
!has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
|
|
|
b34d678 |
unsigned long mask;
|
|
|
b34d678 |
/*
|
|
|
b34d678 |
@@ -4621,7 +4621,7 @@ restart:
|
|
|
b34d678 |
*/
|
|
|
b34d678 |
if (sp->role.direct &&
|
|
|
b34d678 |
!kvm_is_reserved_pfn(pfn) &&
|
|
|
b34d678 |
- PageTransCompound(pfn_to_page(pfn))) {
|
|
|
b34d678 |
+ PageTransCompoundMap(pfn_to_page(pfn))) {
|
|
|
b34d678 |
drop_spte(kvm, sptep);
|
|
|
b34d678 |
need_tlb_flush = 1;
|
|
|
b34d678 |
goto restart;
|
|
|
b34d678 |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
|
|
|
b34d678 |
index 19724e6..522bd6d 100644
|
|
|
b34d678 |
--- a/include/linux/page-flags.h
|
|
|
b34d678 |
+++ b/include/linux/page-flags.h
|
|
|
b34d678 |
@@ -517,6 +517,27 @@ static inline int PageTransCompound(struct page *page)
|
|
|
b34d678 |
}
|
|
|
b34d678 |
|
|
|
b34d678 |
/*
|
|
|
b34d678 |
+ * PageTransCompoundMap is the same as PageTransCompound, but it also
|
|
|
b34d678 |
+ * guarantees the primary MMU has the entire compound page mapped
|
|
|
b34d678 |
+ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
|
|
|
b34d678 |
+ * can also map the entire compound page. This allows the secondary
|
|
|
b34d678 |
+ * MMUs to call get_user_pages() only once for each compound page and
|
|
|
b34d678 |
+ * to immediately map the entire compound page with a single secondary
|
|
|
b34d678 |
+ * MMU fault. If there will be a pmd split later, the secondary MMUs
|
|
|
b34d678 |
+ * will get an update through the MMU notifier invalidation through
|
|
|
b34d678 |
+ * split_huge_pmd().
|
|
|
b34d678 |
+ *
|
|
|
b34d678 |
+ * Unlike PageTransCompound, this is safe to be called only while
|
|
|
b34d678 |
+ * split_huge_pmd() cannot run from under us, like if protected by the
|
|
|
b34d678 |
+ * MMU notifier, otherwise it may result in page->_mapcount < 0 false
|
|
|
b34d678 |
+ * positives.
|
|
|
b34d678 |
+ */
|
|
|
b34d678 |
+static inline int PageTransCompoundMap(struct page *page)
|
|
|
b34d678 |
+{
|
|
|
b34d678 |
+ return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
|
|
|
b34d678 |
+}
|
|
|
b34d678 |
+
|
|
|
b34d678 |
+/*
|
|
|
b34d678 |
* PageTransTail returns true for both transparent huge pages
|
|
|
b34d678 |
* and hugetlbfs pages, so it should only be called when it's known
|
|
|
b34d678 |
* that hugetlbfs pages aren't involved.
|
|
|
b34d678 |
@@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page)
|
|
|
b34d678 |
#else
|
|
|
b34d678 |
TESTPAGEFLAG_FALSE(TransHuge)
|
|
|
b34d678 |
TESTPAGEFLAG_FALSE(TransCompound)
|
|
|
b34d678 |
+TESTPAGEFLAG_FALSE(TransCompoundMap)
|
|
|
b34d678 |
TESTPAGEFLAG_FALSE(TransTail)
|
|
|
b34d678 |
TESTPAGEFLAG_FALSE(DoubleMap)
|
|
|
b34d678 |
TESTSETFLAG_FALSE(DoubleMap)
|
|
|
b34d678 |
--
|
|
|
b34d678 |
2.7.4
|
|
|
b34d678 |
|