From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 4 Nov 2019 12:22:02 +0100
Subject: kvm: mmu: ITLB_MULTIHIT mitigation

commit b8e8c8303ff28c61046a4d0f6ea99aea609a7dc0 upstream

With some Intel processors, putting the same virtual address in the TLB
as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit
and cause the processor to issue a machine check resulting in a CPU lockup.

Unfortunately when EPT page tables use huge pages, it is possible for a
malicious guest to cause this situation.

Add a knob to mark huge pages as non-executable. When the nx_huge_pages
parameter is enabled (and we are using EPT), all huge pages are marked as
NX. If the guest attempts to execute in one of those pages, the page is
broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then
the host is in control of TLB flushes and the problematic situation cannot
happen. With nested EPT, the nested guest can again cause problems, so
shadow and direct EPT are treated in the same way.

[ tglx: Fixup default to auto and massage wording a bit ]

Originally-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
.../admin-guide/kernel-parameters.txt | 19 +++
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kernel/cpu/bugs.c | 13 +-
arch/x86/kvm/mmu.c | 141 +++++++++++++++++-
arch/x86/kvm/paging_tmpl.h | 29 +++-
arch/x86/kvm/x86.c | 9 ++
6 files changed, 200 insertions(+), 13 deletions(-)

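[ Note, not part of the patch: a minimal host-side sketch for checking the
  mitigation state this series exposes. It only reads the nx_huge_pages
  module parameter added here plus the itlb_multihit sysfs file from the
  companion reporting patch; both paths are assumptions about the running
  kernel, not something this diff introduces by itself. ]

#include <stdio.h>

static void show(const char *path)
{
	char buf[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("%s: <not present>\n", path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* sysfs output already ends in '\n' */
	fclose(f);
}

int main(void)
{
	/* "Y" or "N", reflecting kvm.nx_huge_pages (force/off/auto). */
	show("/sys/module/kvm/parameters/nx_huge_pages");
	/* "KVM: Mitigation: Split huge pages" when the workaround is active. */
	show("/sys/devices/system/cpu/vulnerabilities/itlb_multihit");
	return 0;
}
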
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1956,6 +1956,19 @@
KVM MMU at runtime.
Default is 0 (off)

+ kvm.nx_huge_pages=
+ [KVM] Controls the software workaround for the
+ X86_BUG_ITLB_MULTIHIT bug.
+ force : Always deploy workaround.
+ off : Never deploy workaround.
+ auto : Deploy workaround based on the presence of
+ X86_BUG_ITLB_MULTIHIT.
+
+ Default is 'auto'.
+
+ If the software workaround is enabled for the host,
+ guests need not enable it for nested guests.
+
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)

@@ -2522,6 +2535,12 @@
l1tf=off [X86]
mds=off [X86]
tsx_async_abort=off [X86]
+ kvm.nx_huge_pages=off [X86]
+
+ Exceptions:
+ This does not have any effect on
+ kvm.nx_huge_pages when
+ kvm.nx_huge_pages=force.

auto (default)
Mitigate all CPU vulnerabilities, but leave SMT
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -293,6 +293,7 @@ struct kvm_mmu_page {
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
bool unsync;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -887,6 +888,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
+ ulong nx_lpage_splits;
ulong max_mmu_page_hash_collisions;
};

--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1225,6 +1225,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}

+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt

@@ -1380,17 +1383,25 @@ static ssize_t l1tf_show_state(char *buf
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
-#endif

static ssize_t itlb_multihit_show_state(char *buf)
{
return sprintf(buf, "Processor vulnerable\n");
}
+#endif

static ssize_t mds_show_state(char *buf)
{
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,6 +49,20 @@
#include <asm/kvm_page_track.h>
#include "trace.h"

+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -285,6 +299,11 @@ static inline bool spte_ad_enabled(u64 s
return !(spte & shadow_acc_track_value);
}

+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1097,6 +1116,15 @@ static void account_shadowed(struct kvm
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1114,6 +1142,12 @@ static void unaccount_shadowed(struct kv
kvm_mmu_gfn_allow_lpage(slot, gfn);
}

+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2666,6 +2700,9 @@ static int kvm_mmu_prepare_zap_page(stru
kvm_reload_remote_mmus(kvm);
}

+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return ret;
}
@@ -2874,6 +2911,11 @@ static int set_spte(struct kvm_vcpu *vcp
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);

+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -3092,9 +3134,32 @@ static void direct_pte_prefetch(struct k
__direct_pte_prefetch(vcpu, sp, sptep);
}

+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
- bool prefault)
+ bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3107,6 +3172,12 @@ static int __direct_map(struct kvm_vcpu

trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3117,6 +3188,8 @@ static int __direct_map(struct kvm_vcpu
it.level - 1, true, ACC_ALL);

link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}

@@ -3417,11 +3490,14 @@ static int nonpaging_map(struct kvm_vcpu
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();

+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3455,7 +3531,8 @@ static int nonpaging_map(struct kvm_vcpu
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -4049,6 +4126,8 @@ static int tdp_page_fault(struct kvm_vcp
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();

MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

@@ -4059,8 +4138,9 @@ static int tdp_page_fault(struct kvm_vcp
if (r)
return r;

- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -4089,7 +4169,8 @@ static int tdp_page_fault(struct kvm_vcp
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -5887,10 +5968,58 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}

+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;

+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
kvm_mmu_reset_all_pte_masks();

pte_list_desc_cache = kmem_cache_create("pte_list_desc",
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct k
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
int top_level, ret;
- gfn_t base_gfn;
+ gfn_t gfn, base_gfn;

direct_access = gw->pte_access;

@@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu
link_shadow_page(vcpu, it.sptep, sp);
}

- base_gfn = gw->gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;

trace_kvm_mmu_spte_requested(addr, gw->level, pfn);

for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;

@@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}

@@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;

pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

@@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);

out_unlock:
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -206,6 +206,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
@@ -1116,6 +1117,14 @@ u64 kvm_get_arch_capabilities(void)
rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);

/*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us