From 1275559aac5f3fc90488d1285b2e2902801ab383 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno
Date: Thu, 12 May 2016 23:52:30 +0200
Subject: [PATCH] [arm,x86] Fix memory corruption in KVM with THP enabled.

---
 debian/changelog                              |   1 +
 ...emory-corruption-in-KVM-with-THP-ena.patch | 143 ++++++++++++++++++
 debian/patches/series                         |   1 +
 3 files changed, 145 insertions(+)
 create mode 100644 debian/patches/bugfix/all/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch

diff --git a/debian/changelog b/debian/changelog
index 031f055a5..30cccb4f2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -83,6 +83,7 @@ linux (4.5.4-1) UNRELEASED; urgency=medium
   [ Aurelien Jarno ]
   * [mips*] Fix PR_SET_FPMODE issues with multi-threaded programs.
   * [i386] Stop recommending libc6-i686.
+  * [arm,x86] Fix memory corruption in KVM with THP enabled.
 
  -- Aurelien Jarno  Tue, 10 May 2016 23:58:07 +0200
 
diff --git a/debian/patches/bugfix/all/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch b/debian/patches/bugfix/all/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
new file mode 100644
index 000000000..0546c7321
--- /dev/null
+++ b/debian/patches/bugfix/all/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
@@ -0,0 +1,143 @@
+From: Andrea Arcangeli
+Date: Thu, 5 May 2016 16:22:20 -0700
+Subject: mm: thp: kvm: fix memory corruption in KVM with THP enabled
+Origin: https://git.kernel.org/linus/127393fbe597dd85863a9bdccaa11007e7d4948f
+
+After the THP refcounting change, obtaining a compound page from
+get_user_pages() no longer allows us to assume the entire compound page
+is immediately mappable from a secondary MMU.
+
+A secondary MMU doesn't want to call get_user_pages() more than once for
+each compound page, in order to know if it can map the whole compound
+page. So a secondary MMU needs to know from a single get_user_pages()
+invocation when it can immediately map the entire compound page, to
+avoid a flood of unnecessary secondary MMU faults and spurious
+atomic_inc()/atomic_dec() (pages don't have to be pinned by MMU notifier
+users).
+
+Ideally, instead of the page->_mapcount < 1 check, get_user_pages()
+should return the granularity of the "page" mapping in the "mm" passed
+to get_user_pages(). However it's a non-trivial change to pass the "pmd"
+status belonging to the "mm" walked by get_user_pages up the stack (up
+to the caller of get_user_pages). So the fix just checks that there is
+not a single pte mapping on the page returned by get_user_pages, in
+which case the caller can assume that the whole compound page is mapped
+in the current "mm" (in a pmd_trans_huge()). In that case the entire
+compound page is safe to map into the secondary MMU without additional
+get_user_pages() calls on the surrounding tail/head pages. In addition
+to being faster, not having to run other get_user_pages() calls also
+reduces the memory footprint of the secondary MMU fault in case the pmd
+split happened as a result of memory pressure.
+
+Without this fix, after a MADV_DONTNEED (as invoked by QEMU during
+postcopy live migration or ballooning) or after generic swapping (with a
+failure in split_huge_page() that would only result in pmd splitting and
+not a physical page split), KVM would map the whole compound page into
+the shadow pagetables, even though regular faults or userfaults (like
+UFFDIO_COPY) may map regular pages into the primary MMU as a result of
+the pte faults, leading to guest mode and userland mode going out of
+sync and not working on the same memory at all times.
+
+Any other secondary MMU notifier manager (KVM is just one of many MMU
+notifier users) will need the same information if it doesn't want to
+run a flood of get_user_pages_fast calls and can support multiple
+granularities in its secondary MMU mappings, so I think it is justified
+to expose this not just to KVM.
+
+The other option would be to move transparent_hugepage_adjust to
+mm/huge_memory.c, but that currently has all kinds of KVM data
+structures in it, so it's definitely not cut-and-paste work, and I
+couldn't produce a cleaner fix than this one for 4.6.
+
+Signed-off-by: Andrea Arcangeli
+Cc: "Dr. David Alan Gilbert"
+Cc: "Kirill A. Shutemov"
+Cc: "Li, Liang Z"
+Cc: Amit Shah
+Cc: Paolo Bonzini
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+---
+ arch/arm/kvm/mmu.c         |  2 +-
+ arch/x86/kvm/mmu.c         |  4 ++--
+ include/linux/page-flags.h | 22 ++++++++++++++++++++++
+ 3 files changed, 25 insertions(+), 3 deletions(-)
+
+diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
+index 58dbd5c..d6d4191 100644
+--- a/arch/arm/kvm/mmu.c
++++ b/arch/arm/kvm/mmu.c
+@@ -1004,7 +1004,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
+ 	kvm_pfn_t pfn = *pfnp;
+ 	gfn_t gfn = *ipap >> PAGE_SHIFT;
+ 
+-	if (PageTransCompound(pfn_to_page(pfn))) {
++	if (PageTransCompoundMap(pfn_to_page(pfn))) {
+ 		unsigned long mask;
+ 		/*
+ 		 * The address we faulted on is backed by a transparent huge
+diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
+index 1ff4dbb..b6f50e8 100644
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -2823,7 +2823,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ 	 */
+ 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+ 	    level == PT_PAGE_TABLE_LEVEL &&
+-	    PageTransCompound(pfn_to_page(pfn)) &&
++	    PageTransCompoundMap(pfn_to_page(pfn)) &&
+ 	    !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ 		unsigned long mask;
+ 		/*
+@@ -4785,7 +4785,7 @@ restart:
+ 	 */
+ 	if (sp->role.direct &&
+ 	    !kvm_is_reserved_pfn(pfn) &&
+-	    PageTransCompound(pfn_to_page(pfn))) {
++	    PageTransCompoundMap(pfn_to_page(pfn))) {
+ 		drop_spte(kvm, sptep);
+ 		need_tlb_flush = 1;
+ 		goto restart;
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index f4ed4f1b..6b052aa 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -517,6 +517,27 @@ static inline int PageTransCompound(struct page *page)
+ }
+ 
+ /*
++ * PageTransCompoundMap is the same as PageTransCompound, but it also
++ * guarantees the primary MMU has the entire compound page mapped
++ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
++ * can also map the entire compound page. This allows the secondary
++ * MMUs to call get_user_pages() only once for each compound page and
++ * to immediately map the entire compound page with a single secondary
++ * MMU fault. If a pmd split happens later, the secondary MMUs will
++ * get an update through the MMU notifier invalidation in
++ * split_huge_pmd().
++ *
++ * Unlike PageTransCompound, this is only safe to call while
++ * split_huge_pmd() cannot run from under us, e.g. when protected by
++ * the MMU notifier; otherwise it may result in page->_mapcount < 0
++ * false positives.
++ */
++static inline int PageTransCompoundMap(struct page *page)
++{
++	return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
++}
++
++/*
+ * PageTransTail returns true for both transparent huge pages
+ * and hugetlbfs pages, so it should only be called when it's known
+ * that hugetlbfs pages aren't involved.
+@@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page)
+ #else
+ TESTPAGEFLAG_FALSE(TransHuge)
+ TESTPAGEFLAG_FALSE(TransCompound)
++TESTPAGEFLAG_FALSE(TransCompoundMap)
+ TESTPAGEFLAG_FALSE(TransTail)
+ TESTPAGEFLAG_FALSE(DoubleMap)
+ TESTSETFLAG_FALSE(DoubleMap)
+--
+2.8.1
+
diff --git a/debian/patches/series b/debian/patches/series
index 8ce39e08e..8e8976ebf 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -83,6 +83,7 @@ bugfix/all/mm-zone_device-depends-on-sparsemem_vmemmap.patch
 bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch
 bugfix/all/atl2-disable-unimplemented-scatter-gather-feature.patch
 bugfix/all/module-invalidate-signatures-on-force-loaded-modules.patch
+bugfix/all/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
 
 # Miscellaneous features
 features/all/mm-exclude-zone_device-from-gfp_zone_table.patch
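
The _mapcount convention that the new helper relies on is compact enough to
model outside the kernel: page->_mapcount starts at -1 and is incremented once
per pte mapping, so a compound page whose head still reads below zero has no
pte mappings and can only be mapped through the single pmd. The following
standalone C sketch models just that predicate; struct page, its fields, and
the helper names here are simplified stand-ins for illustration, not the
kernel's real definitions.

/*
 * Standalone model of the PageTransCompoundMap() check added by this
 * patch. Only the convention matters here: _mapcount starts at -1 and
 * each pte mapping increments it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	bool compound;		/* stand-in for PageTransCompound() */
	atomic_int _mapcount;	/* -1 means no pte mappings, as in the kernel */
};

/*
 * Mirrors the patch's check: a compound page with no pte mappings must
 * be pmd-mapped in the primary MMU, so a secondary MMU may map the
 * whole compound page from a single fault.
 */
static bool page_trans_compound_map(struct page *page)
{
	return page->compound && atomic_load(&page->_mapcount) < 0;
}

int main(void)
{
	struct page thp = { .compound = true, ._mapcount = -1 };

	/* Freshly faulted THP: pmd-mapped only, so the check passes. */
	printf("pmd-mapped: %d\n", page_trans_compound_map(&thp));

	/*
	 * After a pmd split, pte mappings push _mapcount to >= 0, so a
	 * secondary MMU must fall back to mapping individual small pages.
	 */
	atomic_fetch_add(&thp._mapcount, 1);
	printf("pte-mapped: %d\n", page_trans_compound_map(&thp));
	return 0;
}

As the comment added to page-flags.h stresses, the real check is only stable
while split_huge_pmd() cannot run concurrently (e.g. under MMU notifier
protection); in this single-threaded model that race simply cannot occur.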