From 94f984ff563d1777652b822d7a282cacc1e481c2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 27 Apr 2016 12:04:46 -0500
Subject: [PATCH] mm: thp: kvm: fix memory corruption in KVM with THP enabled

After the THP refcounting change, obtaining a compound page from
get_user_pages() no longer allows us to assume the entire compound
page is immediately mappable from a secondary MMU.

A secondary MMU doesn't want to call get_user_pages() more than once
for each compound page, in order to know whether it can map the whole
compound page. So a secondary MMU needs to know, from a single
get_user_pages() invocation, when it can immediately map the entire
compound page; this avoids a flood of unnecessary secondary MMU
faults and spurious atomic_inc()/atomic_dec() calls (pages don't have
to be pinned by MMU notifier users).

Ideally, instead of the page->_mapcount < 0 check, get_user_pages()
should return the granularity of the "page" mapping in the "mm"
passed to get_user_pages(). However, it's a non-trivial change to
pass the "pmd" status belonging to the "mm" walked by
get_user_pages() up the stack (up to the caller of get_user_pages()).
So the fix simply checks that the page returned by get_user_pages()
has no pte mapping at all, in which case the caller can assume the
whole compound page is mapped in the current "mm" (through a
pmd_trans_huge() pmd). In that case the entire compound page is safe
to map into the secondary MMU without additional get_user_pages()
calls on the surrounding tail/head pages. Besides being faster, not
having to run further get_user_pages() calls also reduces the memory
footprint of the secondary MMU fault in case the pmd split happened
as a result of memory pressure.

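To illustrate the intended caller pattern, a minimal sketch follows
(assuming the usual linux/mm.h and linux/huge_mm.h includes). The
map_*() helpers are hypothetical stand-ins for a secondary MMU's own
mapping primitives, not part of this patch, and the caller is assumed
to hold off split_huge_pmd() via the MMU notifier, as the new
helper's comment requires:

	/* Hypothetical secondary MMU mapping primitives (not in tree). */
	extern void map_compound_in_secondary_mmu(struct page *head,
						  unsigned long addr);
	extern void map_4k_in_secondary_mmu(struct page *page,
					    unsigned long addr);

	/* "page" was returned by a single get_user_pages() invocation. */
	static void secondary_mmu_map_page(struct page *page,
					   unsigned long addr)
	{
		if (PageTransCompoundMap(page)) {
			/*
			 * The whole compound page is pmd-mapped in the
			 * primary MMU: safe to map all of it from this
			 * single fault, with no further
			 * get_user_pages() calls on the other subpages.
			 */
			map_compound_in_secondary_mmu(compound_head(page),
						      addr & HPAGE_PMD_MASK);
		} else {
			/*
			 * Not compound, or the pmd was split: only this
			 * 4k subpage is known to be mapped in the
			 * primary MMU.
			 */
			map_4k_in_secondary_mmu(page, addr);
		}
	}
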
Without this fix, after a MADV_DONTNEED (as invoked by QEMU during
postcopy live migration or ballooning) or after generic swapping
(with a failure in split_huge_page() that results only in a pmd split
and not a physical page split), KVM would map the whole compound page
into the shadow pagetables, even though regular faults or userfaults
(like UFFDIO_COPY) may map regular pages into the primary MMU as a
result of the pte faults. Guest mode and userland mode would then go
out of sync, no longer working on the same memory at all times.

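For concreteness, the kind of userland sequence (e.g. run by QEMU on
guest memory) that exposes the mismatch; buffer placement and sizes
are illustrative and assume a 2MB-aligned, THP-backed mapping:

	#include <string.h>
	#include <sys/mman.h>

	/* "buf" is 2MB aligned and backed by a transparent hugepage. */
	static void trigger_pmd_split(char *buf)
	{
		/* Populate the whole 2MB huge page. */
		memset(buf, 1, 2 << 20);
		/* Zap one 4k subpage: the pmd splits, but the physical
		 * compound page does not. */
		madvise(buf, 4096, MADV_DONTNEED);
		/* Refault: the primary MMU now maps a fresh 4k page. */
		buf[0] = 2;
	}

After such a sequence, a PageTransCompound()-based check would still
let KVM map the stale 2MB compound page into the shadow pagetables,
while the primary MMU maps the freshly faulted 4k page.
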
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/arm/kvm/mmu.c         |  2 +-
 arch/x86/kvm/mmu.c         |  4 ++--
 include/linux/page-flags.h | 22 ++++++++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index aba61fd..8dafe97 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -997,7 +997,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 	kvm_pfn_t pfn = *pfnp;
 	gfn_t gfn = *ipap >> PAGE_SHIFT;
 
-	if (PageTransCompound(pfn_to_page(pfn))) {
+	if (PageTransCompoundMap(pfn_to_page(pfn))) {
 		unsigned long mask;
 		/*
 		 * The address we faulted on is backed by a transparent huge
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1e7a49b..3a371f7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2767,7 +2767,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 */
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
-	    PageTransCompound(pfn_to_page(pfn)) &&
+	    PageTransCompoundMap(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
 		unsigned long mask;
 		/*
@@ -4621,7 +4621,7 @@ restart:
 		 */
 		if (sp->role.direct &&
 			!kvm_is_reserved_pfn(pfn) &&
-			PageTransCompound(pfn_to_page(pfn))) {
+			PageTransCompoundMap(pfn_to_page(pfn))) {
 			drop_spte(kvm, sptep);
 			need_tlb_flush = 1;
 			goto restart;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 19724e6..522bd6d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -517,6 +517,27 @@ static inline int PageTransCompound(struct page *page)
 }
 
 /*
+ * PageTransCompoundMap is the same as PageTransCompound, but it also
+ * guarantees the primary MMU has the entire compound page mapped
+ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
+ * can also map the entire compound page. This allows the secondary
+ * MMUs to call get_user_pages() only once for each compound page and
+ * to immediately map the entire compound page with a single secondary
+ * MMU fault. If a pmd split happens later, the secondary MMUs
+ * will get an update via the MMU notifier invalidation in
+ * split_huge_pmd().
+ *
+ * Unlike PageTransCompound, this is only safe to call while
+ * split_huge_pmd() cannot run from under us, e.g. when protected by
+ * the MMU notifier; otherwise it may result in page->_mapcount < 0
+ * false positives.
+ */
+static inline int PageTransCompoundMap(struct page *page)
+{
+	return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
+}
+
+/*
  * PageTransTail returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
  * that hugetlbfs pages aren't involved.
@@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page)
 #else
 TESTPAGEFLAG_FALSE(TransHuge)
 TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransCompoundMap)
 TESTPAGEFLAG_FALSE(TransTail)
 TESTPAGEFLAG_FALSE(DoubleMap)
 	TESTSETFLAG_FALSE(DoubleMap)
-- 
2.7.4