linux/debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-...

From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 24 Oct 2019 18:34:28 +0200
Subject: kvm: mmu: ITLB_MULTIHIT mitigation

With some Intel processors, putting the same virtual address in the TLB
as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit
and cause the processor to issue a machine check.  Unfortunately if EPT
page tables use huge pages, it is possible for a malicious guest to cause
this situation.

This patch adds a knob to mark huge pages as non-executable. When the
nx_huge_pages parameter is enabled (and we are using EPT), all huge pages
are marked as NX. If the guest attempts to execute in one of those pages,
the page is broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then
the host is in control of TLB flushes and the problematic situation cannot
happen.  With nested EPT, again the nested guest can cause problems so we
treat shadow and direct EPT the same.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19:
 - Use kvm_mmu_invalidate_zap_all_pages() instead of kvm_mmu_zap_all_fast()
 - Adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 .../admin-guide/kernel-parameters.txt         |  11 ++
 arch/x86/include/asm/kvm_host.h               |   2 +
 arch/x86/kernel/cpu/bugs.c                    |  13 +-
 arch/x86/kvm/mmu.c                            | 135 +++++++++++++++++-
 arch/x86/kvm/paging_tmpl.h                    |  29 +++-
 arch/x86/kvm/x86.c                            |   9 ++
 6 files changed, 186 insertions(+), 13 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 16607b178b47..b2c1a5c63ab3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1956,6 +1956,17 @@
 			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm.nx_huge_pages=
+			[KVM] Controls the sw workaround for bug
+			X86_BUG_ITLB_MULTIHIT.
+			force	: Always deploy workaround.
+			off	: Never deploy workaround.
+			auto	: Default. Deploy workaround based on
+				  presence of X86_BUG_ITLB_MULTIHIT.
+
+			If the sw workaround is enabled for the host, guests
+			need not enable it for nested guests.
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 90dccb5c79d9..59b44445ed59 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -282,6 +282,7 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 	bool unsync;
+	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -890,6 +891,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong nx_lpage_splits;
 	ulong max_mmu_page_hash_collisions;
 };
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 60e47e492c2f..1e764992fa64 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1119,6 +1119,9 @@ void x86_spec_ctrl_setup_ap(void)
 		x86_amd_ssb_disable();
 }
 
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
 
@@ -1274,17 +1277,25 @@ static ssize_t l1tf_show_state(char *buf)
 		       l1tf_vmx_states[l1tf_vmx_mitigation],
 		       sched_smt_active() ? "vulnerable" : "disabled");
 }
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	if (itlb_multihit_kvm_mitigation)
+		return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+	else
+		return sprintf(buf, "KVM: Vulnerable\n");
+}
 #else
 static ssize_t l1tf_show_state(char *buf)
 {
 	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
 }
-#endif
 
 static ssize_t itlb_multihit_show_state(char *buf)
 {
 	return sprintf(buf, "Processor vulnerable\n");
 }
+#endif
 
 static ssize_t mds_show_state(char *buf)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7f9be921df7c..19c3dc9b05cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,6 +49,20 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+	.set = set_nx_huge_pages,
+	.get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -284,6 +298,11 @@ static inline bool spte_ad_enabled(u64 spte)
 	return !(spte & shadow_acc_track_value);
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+	return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
 	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1096,6 +1115,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->lpage_disallowed)
+		return;
+
+	++kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
@@ -1113,6 +1141,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	--kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = false;
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
 					  struct kvm_memory_slot *slot)
 {
@@ -2665,6 +2699,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 			kvm_reload_remote_mmus(kvm);
 	}
 
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
 	sp->role.invalid = 1;
 	return ret;
 }
@@ -2873,6 +2910,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -3091,9 +3133,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+				       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+	int level = *levelp;
+	u64 spte = *it.sptep;
+
+	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	    is_nx_huge_page_enabled() &&
+	    is_shadow_present_pte(spte) &&
+	    !is_large_pte(spte)) {
+		/*
+		 * A small SPTE exists for this pfn, but FNAME(fetch)
+		 * and __direct_map would like to create a large PTE
+		 * instead: just force them to go down another level,
+		 * patching back for them into pfn the next 9 bits of
+		 * the address.
+		 */
+		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+		*pfnp |= gfn & page_mask;
+		(*levelp)--;
+	}
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault)
+			bool prefault, bool lpage_disallowed)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
@@ -3106,6 +3171,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
 			break;
@@ -3116,6 +3187,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 					      it.level - 1, true, ACC_ALL);
 
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -3416,11 +3489,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 	int r;
 	int level;
-	bool force_pt_level = false;
+	bool force_pt_level;
 	kvm_pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
+	force_pt_level = lpage_disallowed;
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		/*
@@ -3454,7 +3530,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+			 prefault, false);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4048,6 +4125,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -4058,8 +4137,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-							   PT_DIRECTORY_LEVEL);
+	force_pt_level =
+		lpage_disallowed ||
+		!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		if (level > PT_DIRECTORY_LEVEL &&
@@ -4088,7 +4168,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+			 prefault, lpage_disallowed);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -5886,10 +5967,52 @@ static void mmu_destroy_caches(void)
 	kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static void __set_nx_huge_pages(bool val)
+{
+	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+	bool old_val = nx_huge_pages;
+	bool new_val;
+
+	/* In "auto" mode deploy workaround only if CPU has the bug. */
+	if (sysfs_streq(val, "off"))
+		new_val = 0;
+	else if (sysfs_streq(val, "force"))
+		new_val = 1;
+	else if (sysfs_streq(val, "auto"))
+		new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);
+	else if (strtobool(val, &new_val) < 0)
+		return -EINVAL;
+
+	__set_nx_huge_pages(new_val);
+
+	if (new_val != old_val) {
+		struct kvm *kvm;
+		int idx;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			idx = srcu_read_lock(&kvm->srcu);
+			kvm_mmu_invalidate_zap_all_pages(kvm);
+			srcu_read_unlock(&kvm->srcu, idx);
+		}
+		mutex_unlock(&kvm_lock);
+	}
+
+	return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
+	if (nx_huge_pages == -1)
+		__set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT));
+
 	kvm_mmu_reset_all_pte_masks();
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3b022b08b577..adf42dc8d38b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int write_fault, int hlevel,
-			 kvm_pfn_t pfn, bool map_writable, bool prefault)
+			 kvm_pfn_t pfn, bool map_writable, bool prefault,
+			 bool lpage_disallowed)
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct kvm_shadow_walk_iterator it;
 	unsigned direct_access, access = gw->pt_access;
 	int top_level, ret;
-	gfn_t base_gfn;
+	gfn_t gfn, base_gfn;
 
 	direct_access = gw->pte_access;
 
@@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			link_shadow_page(vcpu, it.sptep, sp);
 	}
 
-	base_gfn = gw->gfn;
+	/*
+	 * FNAME(page_fault) might have clobbered the bottom bits of
+	 * gw->gfn, restore them from the virtual address.
+	 */
+	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+	base_gfn = gfn;
 
 	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
 
 	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
 		clear_sp_write_flooding_count(it.sptep);
-		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == hlevel)
 			break;
 
@@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
 					      it.level - 1, true, direct_access);
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	kvm_pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
-	bool force_pt_level = false;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
+	bool force_pt_level = lpage_disallowed;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-			 level, pfn, map_writable, prefault);
+			 level, pfn, map_writable, prefault, lpage_disallowed);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2714c1a0e59f..ec80bb27504f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -206,6 +206,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages, .mode = 0444) },
+	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
 	{ "max_mmu_page_hash_collisions",
 		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
@@ -1130,6 +1131,14 @@ u64 kvm_get_arch_capabilities(void)
 
 	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
 
+	/*
+	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+	 * the nested hypervisor runs with NX huge pages.  If it is not,
+	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+	 * L1 guests, so it need not worry about its own (L2) guests.
+	 */
+	data |= ARCH_CAP_PSCHANGE_MC_NO;
+
 	/*
 	 * If we're doing cache flushes (either "always" or "cond")
 	 * we will do one whenever the guest does a vmlaunch/vmresume.
[x86] KVM: Add mitigation for Machine Check Error on Page Size Change (aka iTLB multi-hit, CVE-2018-12207) This is a backport of v6 of the "NX" patch set, and will probably require updates before release. 2019-10-20 13:32:35 +00:00			`From: Paolo Bonzini <pbonzini@redhat.com>`
[x86] Update NX patch set to v7 2019-10-24 21:48:50 +00:00			`Date: Thu, 24 Oct 2019 18:34:28 +0200`
[x86] KVM: Add mitigation for Machine Check Error on Page Size Change (aka iTLB multi-hit, CVE-2018-12207) This is a backport of v6 of the "NX" patch set, and will probably require updates before release. 2019-10-20 13:32:35 +00:00			`Subject: kvm: mmu: ITLB_MULTIHIT mitigation`

			`With some Intel processors, putting the same virtual address in the TLB`
			`as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit`
			`and cause the processor to issue a machine check. Unfortunately if EPT`
			`page tables use huge pages, it is possible for a malicious guest to cause`
			`this situation.`

			`This patch adds a knob to mark huge pages as non-executable. When the`
			`nx_huge_pages parameter is enabled (and we are using EPT), all huge pages`
			`are marked as NX. If the guest attempts to execute in one of those pages,`
			`the page is broken down into 4K pages, which are then marked executable.`

			`This is not an issue for shadow paging (except nested EPT), because then`
			`the host is in control of TLB flushes and the problematic situation cannot`
			`happen. With nested EPT, again the nested guest can cause problems so we`
			`treat shadow and direct EPT the same.`

			`Signed-off-by: Junaid Shahid <junaids@google.com>`
			`Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>`
			`[bwh: Backported to 4.19:`
			`- Use kvm_mmu_invalidate_zap_all_pages() instead of kvm_mmu_zap_all_fast()`
			`- Adjust context]`
			`Signed-off-by: Ben Hutchings <ben@decadent.org.uk>`
			`---`
			`.../admin-guide/kernel-parameters.txt \| 11 ++`
			`arch/x86/include/asm/kvm_host.h \| 2 +`
			`arch/x86/kernel/cpu/bugs.c \| 13 +-`
			`arch/x86/kvm/mmu.c \| 135 +++++++++++++++++-`
			`arch/x86/kvm/paging_tmpl.h \| 29 +++-`
[x86] Update NX patch set to v7 2019-10-24 21:48:50 +00:00			`arch/x86/kvm/x86.c \| 9 ++`
			`6 files changed, 186 insertions(+), 13 deletions(-)`
[x86] KVM: Add mitigation for Machine Check Error on Page Size Change (aka iTLB multi-hit, CVE-2018-12207) This is a backport of v6 of the "NX" patch set, and will probably require updates before release. 2019-10-20 13:32:35 +00:00
			`diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt`
			`index 16607b178b47..b2c1a5c63ab3 100644`
			`--- a/Documentation/admin-guide/kernel-parameters.txt`
			`+++ b/Documentation/admin-guide/kernel-parameters.txt`
			`@@ -1956,6 +1956,17 @@`
			`KVM MMU at runtime.`
			`Default is 0 (off)`

			`+ kvm.nx_huge_pages=`
			`+ [KVM] Controls the sw workaround for bug`
			`+ X86_BUG_ITLB_MULTIHIT.`
			`+ force : Always deploy workaround.`
			`+ off : Never deploy workaround.`
			`+ auto : Default. Deploy workaround based on`
			`+ presence of X86_BUG_ITLB_MULTIHIT.`
			`+`
			`+ If the sw workaround is enabled for the host, guests`
			`+ need not enable it for nested guests.`
			`+`
			`kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.`
			`Default is 1 (enabled)`

			`diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h`
			`index 90dccb5c79d9..59b44445ed59 100644`
			`--- a/arch/x86/include/asm/kvm_host.h`
			`+++ b/arch/x86/include/asm/kvm_host.h`
			`@@ -282,6 +282,7 @@ struct kvm_mmu_page {`
			`struct list_head link;`
			`struct hlist_node hash_link;`
			`bool unsync;`
			`+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */`

			`/*`
			`* The following two entries are used to key the shadow page in the`
			`@@ -890,6 +891,7 @@ struct kvm_vm_stat {`
			`ulong mmu_unsync;`
			`ulong remote_tlb_flush;`
			`ulong lpages;`
			`+ ulong nx_lpage_splits;`
			`ulong max_mmu_page_hash_collisions;`
			`};`

			`diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c`
			`index 60e47e492c2f..1e764992fa64 100644`
			`--- a/arch/x86/kernel/cpu/bugs.c`
			`+++ b/arch/x86/kernel/cpu/bugs.c`
			`@@ -1119,6 +1119,9 @@ void x86_spec_ctrl_setup_ap(void)`
			`x86_amd_ssb_disable();`
			`}`

			`+bool itlb_multihit_kvm_mitigation;`
			`+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);`
			`+`
			`#undef pr_fmt`
			`#define pr_fmt(fmt) "L1TF: " fmt`

			`@@ -1274,17 +1277,25 @@ static ssize_t l1tf_show_state(char *buf)`
			`l1tf_vmx_states[l1tf_vmx_mitigation],`
			`sched_smt_active() ? "vulnerable" : "disabled");`
			`}`
			`+`
			`+static ssize_t itlb_multihit_show_state(char *buf)`
			`+{`
			`+ if (itlb_multihit_kvm_mitigation)`
			`+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");`
			`+ else`
			`+ return sprintf(buf, "KVM: Vulnerable\n");`
			`+}`
			`#else`
			`static ssize_t l1tf_show_state(char *buf)`
			`{`
			`return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);`
			`}`
			`-#endif`

			`static ssize_t itlb_multihit_show_state(char *buf)`
			`{`
			`return sprintf(buf, "Processor vulnerable\n");`
			`}`
			`+#endif`

			`static ssize_t mds_show_state(char *buf)`
			`{`
			`diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c`
			`index 7f9be921df7c..19c3dc9b05cb 100644`
			`--- a/arch/x86/kvm/mmu.c`
			`+++ b/arch/x86/kvm/mmu.c`
			`@@ -49,6 +49,20 @@`
			`#include <asm/kvm_page_track.h>`
			`#include "trace.h"`

			`+extern bool itlb_multihit_kvm_mitigation;`
			`+`
			`+static int __read_mostly nx_huge_pages = -1;`
			`+`
			`+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);`
			`+`
			`+static struct kernel_param_ops nx_huge_pages_ops = {`
			`+ .set = set_nx_huge_pages,`
			`+ .get = param_get_bool,`
			`+};`
			`+`
			`+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);`
			`+__MODULE_PARM_TYPE(nx_huge_pages, "bool");`
			`+`
			`/*`
			`* When setting this variable to true it enables Two-Dimensional-Paging`
			`* where the hardware walks 2 page tables:`
			`@@ -284,6 +298,11 @@ static inline bool spte_ad_enabled(u64 spte)`
			`return !(spte & shadow_acc_track_value);`
			`}`

			`+static bool is_nx_huge_page_enabled(void)`
			`+{`
			`+ return READ_ONCE(nx_huge_pages);`
			`+}`
			`+`
			`static inline u64 spte_shadow_accessed_mask(u64 spte)`
			`{`
			`MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);`
			`@@ -1096,6 +1115,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)`
			`kvm_mmu_gfn_disallow_lpage(slot, gfn);`
			`}`

			`+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)`
			`+{`
			`+ if (sp->lpage_disallowed)`
			`+ return;`
			`+`
			`+ ++kvm->stat.nx_lpage_splits;`
			`+ sp->lpage_disallowed = true;`
			`+}`
			`+`
			`static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)`
			`{`
			`struct kvm_memslots *slots;`
			`@@ -1113,6 +1141,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)`
			`kvm_mmu_gfn_allow_lpage(slot, gfn);`
			`}`

			`+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)`
			`+{`
			`+ --kvm->stat.nx_lpage_splits;`
			`+ sp->lpage_disallowed = false;`
			`+}`
			`+`
			`static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,`
			`struct kvm_memory_slot *slot)`
			`{`
			`@@ -2665,6 +2699,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,`
			`kvm_reload_remote_mmus(kvm);`
			`}`

			`+ if (sp->lpage_disallowed)`
			`+ unaccount_huge_nx_page(kvm, sp);`
			`+`
			`sp->role.invalid = 1;`
			`return ret;`
			`}`
			`@@ -2873,6 +2910,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,`
			`if (!speculative)`
			`spte \|= spte_shadow_accessed_mask(spte);`

			`+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&`
			`+ is_nx_huge_page_enabled()) {`
			`+ pte_access &= ~ACC_EXEC_MASK;`
			`+ }`
			`+`
			`if (pte_access & ACC_EXEC_MASK)`
			`spte \|= shadow_x_mask;`
			`else`
			`@@ -3091,9 +3133,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)`
			`__direct_pte_prefetch(vcpu, sp, sptep);`
			`}`

			`+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,`
			`+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)`
			`+{`
			`+ int level = *levelp;`
			`+ u64 spte = *it.sptep;`
			`+`
			`+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&`
			`+ is_nx_huge_page_enabled() &&`
			`+ is_shadow_present_pte(spte) &&`
			`+ !is_large_pte(spte)) {`
			`+ /*`
			`+ * A small SPTE exists for this pfn, but FNAME(fetch)`
			`+ * and __direct_map would like to create a large PTE`
			`+ * instead: just force them to go down another level,`
			`+ * patching back for them into pfn the next 9 bits of`
			`+ * the address.`
			`+ */`
			`+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);`
			`+ *pfnp \|= gfn & page_mask;`
			`+ (*levelp)--;`
			`+ }`
			`+}`
			`+`
			`static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,`
			`int map_writable, int level, kvm_pfn_t pfn,`
			`- bool prefault)`
			`+ bool prefault, bool lpage_disallowed)`
			`{`
			`struct kvm_shadow_walk_iterator it;`
			`struct kvm_mmu_page *sp;`
			`@@ -3106,6 +3171,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,`

			`trace_kvm_mmu_spte_requested(gpa, level, pfn);`
			`for_each_shadow_entry(vcpu, gpa, it) {`
			`+ /*`
			`+ * We cannot overwrite existing page tables with an NX`
			`+ * large page, as the leaf could be executable.`
			`+ */`
			`+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);`
			`+`
			`base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);`
			`if (it.level == level)`
			`break;`
			`@@ -3116,6 +3187,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,`
			`it.level - 1, true, ACC_ALL);`

			`link_shadow_page(vcpu, it.sptep, sp);`
			`+ if (lpage_disallowed)`
			`+ account_huge_nx_page(vcpu->kvm, sp);`
			`}`
			`}`

			`@@ -3416,11 +3489,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,`
			`{`
			`int r;`
			`int level;`
			`- bool force_pt_level = false;`
			`+ bool force_pt_level;`
			`kvm_pfn_t pfn;`
			`unsigned long mmu_seq;`
			`bool map_writable, write = error_code & PFERR_WRITE_MASK;`
			`+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&`
			`+ is_nx_huge_page_enabled();`

			`+ force_pt_level = lpage_disallowed;`
			`level = mapping_level(vcpu, gfn, &force_pt_level);`
			`if (likely(!force_pt_level)) {`
			`/*`
			`@@ -3454,7 +3530,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,`
			`goto out_unlock;`
			`if (likely(!force_pt_level))`
			`transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);`
			`- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);`
			`+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,`
			`+ prefault, false);`
			`out_unlock:`
			`spin_unlock(&vcpu->kvm->mmu_lock);`
			`kvm_release_pfn_clean(pfn);`
			`@@ -4048,6 +4125,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,`
			`unsigned long mmu_seq;`
			`int write = error_code & PFERR_WRITE_MASK;`
			`bool map_writable;`
			`+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&`
			`+ is_nx_huge_page_enabled();`

			`MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));`

			`@@ -4058,8 +4137,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,`
			`if (r)`
			`return r;`

			`- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,`
			`- PT_DIRECTORY_LEVEL);`
			`+ force_pt_level =`
			`+ lpage_disallowed \|\|`
			`+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);`
			`level = mapping_level(vcpu, gfn, &force_pt_level);`
			`if (likely(!force_pt_level)) {`
			`if (level > PT_DIRECTORY_LEVEL &&`
			`@@ -4088,7 +4168,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,`
			`goto out_unlock;`
			`if (likely(!force_pt_level))`
			`transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);`
			`- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);`
			`+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,`
			`+ prefault, lpage_disallowed);`
			`out_unlock:`
			`spin_unlock(&vcpu->kvm->mmu_lock);`
			`kvm_release_pfn_clean(pfn);`
			`@@ -5886,10 +5967,52 @@ static void mmu_destroy_caches(void)`
			`kmem_cache_destroy(mmu_page_header_cache);`
			`}`

			`+static void __set_nx_huge_pages(bool val)`
			`+{`
			`+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;`
			`+}`
			`+`
			`+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)`
			`+{`
			`+ bool old_val = nx_huge_pages;`
			`+ bool new_val;`
			`+`
			`+ /* In "auto" mode deploy workaround only if CPU has the bug. */`
			`+ if (sysfs_streq(val, "off"))`
			`+ new_val = 0;`
			`+ else if (sysfs_streq(val, "force"))`
			`+ new_val = 1;`
			`+ else if (sysfs_streq(val, "auto"))`
			`+ new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);`
			`+ else if (strtobool(val, &new_val) < 0)`
			`+ return -EINVAL;`
			`+`
			`+ __set_nx_huge_pages(new_val);`
			`+`
			`+ if (new_val != old_val) {`
			`+ struct kvm *kvm;`
			`+ int idx;`
			`+`
			`+ mutex_lock(&kvm_lock);`
			`+`
			`+ list_for_each_entry(kvm, &vm_list, vm_list) {`
			`+ idx = srcu_read_lock(&kvm->srcu);`
			`+ kvm_mmu_invalidate_zap_all_pages(kvm);`
			`+ srcu_read_unlock(&kvm->srcu, idx);`
			`+ }`
			`+ mutex_unlock(&kvm_lock);`
			`+ }`
			`+`
			`+ return 0;`
			`+}`
			`+`
			`int kvm_mmu_module_init(void)`
			`{`
			`int ret = -ENOMEM;`

			`+ if (nx_huge_pages == -1)`
			`+ __set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT));`
			`+`
			`kvm_mmu_reset_all_pte_masks();`

			`pte_list_desc_cache = kmem_cache_create("pte_list_desc",`
			`diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h`
			`index 3b022b08b577..adf42dc8d38b 100644`
			`--- a/arch/x86/kvm/paging_tmpl.h`
			`+++ b/arch/x86/kvm/paging_tmpl.h`
			`@@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,`
			`static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,`
			`struct guest_walker *gw,`
			`int write_fault, int hlevel,`
			`- kvm_pfn_t pfn, bool map_writable, bool prefault)`
			`+ kvm_pfn_t pfn, bool map_writable, bool prefault,`
			`+ bool lpage_disallowed)`
			`{`
			`struct kvm_mmu_page *sp = NULL;`
			`struct kvm_shadow_walk_iterator it;`
			`unsigned direct_access, access = gw->pt_access;`
			`int top_level, ret;`
			`- gfn_t base_gfn;`
			`+ gfn_t gfn, base_gfn;`

			`direct_access = gw->pte_access;`

			`@@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,`
			`link_shadow_page(vcpu, it.sptep, sp);`
			`}`

			`- base_gfn = gw->gfn;`
			`+ /*`
			`+ * FNAME(page_fault) might have clobbered the bottom bits of`
			`+ * gw->gfn, restore them from the virtual address.`
			`+ */`
			`+ gfn = gw->gfn \| ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);`
			`+ base_gfn = gfn;`

			`trace_kvm_mmu_spte_requested(addr, gw->level, pfn);`

			`for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {`
			`clear_sp_write_flooding_count(it.sptep);`
			`- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);`
			`+`
			`+ /*`
			`+ * We cannot overwrite existing page tables with an NX`
			`+ * large page, as the leaf could be executable.`
			`+ */`
			`+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);`
			`+`
			`+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);`
			`if (it.level == hlevel)`
			`break;`

			`@@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,`
			`sp = kvm_mmu_get_page(vcpu, base_gfn, addr,`
			`it.level - 1, true, direct_access);`
			`link_shadow_page(vcpu, it.sptep, sp);`
			`+ if (lpage_disallowed)`
			`+ account_huge_nx_page(vcpu->kvm, sp);`
			`}`
			`}`

			`@@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,`
			`int r;`
			`kvm_pfn_t pfn;`
			`int level = PT_PAGE_TABLE_LEVEL;`
			`- bool force_pt_level = false;`
			`unsigned long mmu_seq;`
			`bool map_writable, is_self_change_mapping;`
			`+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&`
			`+ is_nx_huge_page_enabled();`
			`+ bool force_pt_level = lpage_disallowed;`

			`pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);`

			`@@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,`
			`if (!force_pt_level)`
			`transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);`
			`r = FNAME(fetch)(vcpu, addr, &walker, write_fault,`
			`- level, pfn, map_writable, prefault);`
			`+ level, pfn, map_writable, prefault, lpage_disallowed);`
			`kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);`

			`out_unlock:`
			`diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c`
[x86] Update NX patch set to v7 2019-10-24 21:48:50 +00:00			`index 2714c1a0e59f..ec80bb27504f 100644`
[x86] KVM: Add mitigation for Machine Check Error on Page Size Change (aka iTLB multi-hit, CVE-2018-12207) This is a backport of v6 of the "NX" patch set, and will probably require updates before release. 2019-10-20 13:32:35 +00:00			`--- a/arch/x86/kvm/x86.c`
			`+++ b/arch/x86/kvm/x86.c`
			`@@ -206,6 +206,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {`
			`{ "mmu_unsync", VM_STAT(mmu_unsync) },`
			`{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },`
			`{ "largepages", VM_STAT(lpages, .mode = 0444) },`
			`+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },`
			`{ "max_mmu_page_hash_collisions",`
			`VM_STAT(max_mmu_page_hash_collisions) },`
			`{ NULL }`
[x86] Update NX patch set to v7 2019-10-24 21:48:50 +00:00			`@@ -1130,6 +1131,14 @@ u64 kvm_get_arch_capabilities(void)`

			`rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);`

			`+ /*`
			`+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that`
			`+ * the nested hypervisor runs with NX huge pages. If it is not,`
			`+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other`
			`+ * L1 guests, so it need not worry about its own (L2) guests.`
			`+ */`
			`+ data \|= ARCH_CAP_PSCHANGE_MC_NO;`
			`+`
			`/*`
			`* If we're doing cache flushes (either "always" or "cond")`
			`* we will do one whenever the guest does a vmlaunch/vmresume.`