[x86] KVM: Add mitigation for Machine Check Error on Page Size Change
(aka iTLB multi-hit, CVE-2018-12207)

This is a backport of v6 of the "NX" patch set, and will probably require
updates before release.

commit d9bd594144 (parent 9aee5ae400)

debian/changelog
@@ -1,3 +1,22 @@
linux (4.19.67-2+deb10u2) UNRELEASED; urgency=medium

  * [x86] KVM: Add mitigation for Machine Check Error on Page Size Change
    (aka iTLB multi-hit, CVE-2018-12207):
    - KVM: x86: adjust kvm_mmu_page member to save 8 bytes
    - kvm: Convert kvm_lock to a mutex
    - kvm: x86: Do not release the page inside mmu_set_spte()
    - KVM: x86: make FNAME(fetch) and __direct_map more similar
    - KVM: x86: remove now unneeded hugepage gfn adjustment
    - KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON
    - KVM: x86: add tracepoints around __direct_map and FNAME(fetch)
    - kvm: x86, powerpc: do not allow clearing largepages debugfs entry
    - x86: Add ITLB_MULTIHIT bug infrastructure
    - kvm: mmu: ITLB_MULTIHIT mitigation
    - kvm: Add helper function for creating VM worker threads
    - kvm: x86: mmu: Recovery of shattered NX large pages

 -- Ben Hutchings <ben@decadent.org.uk>  Sun, 20 Oct 2019 14:21:28 +0100

linux (4.19.67-2+deb10u1) buster-security; urgency=high

  [ Romain Perier ]

@@ -0,0 +1,54 @@
From: Wei Yang <richard.weiyang@gmail.com>
Date: Thu, 6 Sep 2018 05:58:16 +0800
Subject: KVM: x86: adjust kvm_mmu_page member to save 8 bytes

commit 3ff519f29d98ecdc1961d825d105d68711093b6b upstream.

On a 64-bit machine, the struct is naturally aligned to 8 bytes. Since the
kvm_mmu_page members *unsync* and *role* are less than 4 bytes, we can
rearrange the sequence to compact the struct.

As the comment shows, *role* and *gfn* are used to key the shadow page. In
order to keep the comment valid, this patch moves *unsync* up and exchanges
the positions of *role* and *gfn*.

From /proc/slabinfo, the size of kvm_mmu_page is 8 bytes less, with one
more object per slab, after applying this patch:

  # name              <active_objs> <num_objs> <objsize> <objperslab>
  kvm_mmu_page_header        0          0          168        24      (before)
  kvm_mmu_page_header        0          0          160        25      (after)
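For readers less familiar with structure packing, here is a small, runnable
sketch of the idea (simplified stand-in fields, not the real struct
kvm_mmu_page) showing how moving a small member next to another small member
removes padding holes:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-ins: "role" is a 4-byte union in KVM, "gfn" and "spt" are 8 bytes. */
struct before {                 /* layout on a typical LP64 machine          */
	uint64_t gfn;           /* offset 0,  size 8                         */
	uint32_t role;          /* offset 8,  size 4 -> 4 bytes of padding   */
	uint64_t spt;           /* offset 16, size 8                         */
	bool unsync;            /* offset 24, size 1 -> 7 bytes of tail pad  */
};                              /* sizeof == 32                              */

struct after {
	bool unsync;            /* offset 0, size 1                          */
	uint32_t role;          /* offset 4, size 4 (fills the hole)         */
	uint64_t gfn;           /* offset 8                                  */
	uint64_t spt;           /* offset 16                                 */
};                              /* sizeof == 24: 8 bytes saved               */

int main(void)
{
	printf("before: %zu bytes, after: %zu bytes\n",
	       sizeof(struct before), sizeof(struct after));
	return 0;
}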
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/include/asm/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0d3f5cf3ff3e..90dccb5c79d9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -281,18 +281,18 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
+	bool unsync;
 
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
 	 */
-	gfn_t gfn;
 	union kvm_mmu_page_role role;
+	gfn_t gfn;
 
 	u64 *spt;
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
-	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
debian/patches/bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch (new, vendored, 275 lines)
@@ -0,0 +1,275 @@
From: Junaid Shahid <junaids@google.com>
Date: Thu, 3 Jan 2019 17:14:28 -0800
Subject: kvm: Convert kvm_lock to a mutex

commit 0d9ce162cf46c99628cc5da9510b959c7976735b upstream.

It doesn't seem as if there is any particular need for kvm_lock to be a
spinlock, so convert the lock to a mutex so that sleepable functions (in
particular cond_resched()) can be called while holding it.

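For context — a hedged, kernel-style sketch rather than part of the patch —
the reason the conversion enables cond_resched() is that a spinlock holder
runs in atomic context and must not sleep, while a mutex holder may:

#include <linux/mutex.h>
#include <linux/sched.h>

/* Hypothetical lock, named for illustration only. */
static DEFINE_MUTEX(example_lock);   /* was: static DEFINE_SPINLOCK(example_lock); */

static void example_long_walk(void)
{
	mutex_lock(&example_lock);
	/*
	 * Under a spinlock this cond_resched() would be a
	 * "sleeping while atomic" bug; under a mutex the walker
	 * may reschedule, keeping scheduling latency bounded.
	 */
	cond_resched();
	mutex_unlock(&example_lock);
}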
Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 Documentation/virtual/kvm/locking.txt |  4 +---
 arch/s390/kvm/kvm-s390.c              |  4 ++--
 arch/x86/kvm/mmu.c                    |  4 ++--
 arch/x86/kvm/x86.c                    | 14 ++++++-------
 include/linux/kvm_host.h              |  2 +-
 virt/kvm/kvm_main.c                   | 30 +++++++++++++--------------
 6 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 1bb8bcaf8497..635cd6eaf714 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -15,8 +15,6 @@ KVM Lock Overview
 
 On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
 
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock.
-
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
 
@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above.
 ------------
 
 Name:		kvm_lock
-Type:		spinlock_t
+Type:		mutex
 Arch:		any
 Protects:	- vm_list
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fac1d4eaa426..3c317bc6b799 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2110,13 +2110,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
 	if (!kvm->arch.sca)
 		goto out_err;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	sca_offset += 16;
 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
 		sca_offset = 0;
 	kvm->arch.sca = (struct bsca_block *)
 			((char *) kvm->arch.sca + sca_offset);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	sprintf(debug_name, "kvm-%u", current->pid);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 88940261fb53..c9d4e02bd73a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5819,7 +5819,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	int nr_to_scan = sc->nr_to_scan;
 	unsigned long freed = 0;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int idx;
@@ -5869,7 +5869,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		break;
 	}
 
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return freed;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6ae8a013af31..0c085b895e6e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6502,7 +6502,7 @@ static void kvm_hyperv_tsc_notifier(void)
 	struct kvm_vcpu *vcpu;
 	int cpu;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);
 
@@ -6528,7 +6528,7 @@ static void kvm_hyperv_tsc_notifier(void)
 
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 #endif
 
@@ -6586,17 +6586,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
 	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-			if (vcpu->cpu != smp_processor_id())
+			if (vcpu->cpu != raw_smp_processor_id())
 				send_ipi = 1;
 		}
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	if (freq->old < freq->new && send_ipi) {
 		/*
@@ -6722,12 +6722,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 	atomic_set(&kvm_guest_has_master_clock, 0);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d42a36e4e6c2..5246a480d15a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -141,7 +141,7 @@ static inline bool is_error_page(struct page *page)
 
 extern struct kmem_cache *kvm_vcpu_cache;
 
-extern spinlock_t kvm_lock;
+extern struct mutex kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_io_range {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4a584a575221..6a8fe26198b9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -684,9 +684,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	if (r)
 		goto out_err;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
 
@@ -732,9 +732,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -3828,13 +3828,13 @@ static int vm_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3847,12 +3847,12 @@ static int vm_stat_clear(void *_offset, u64 val)
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vm_stat_clear_per_vm((void *)&stat_tmp, 0);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
@@ -3867,13 +3867,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3886,12 +3886,12 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
@@ -3912,7 +3912,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	if (!kvm_dev.this_device || !kvm)
 		return;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	if (type == KVM_EVENT_CREATE_VM) {
 		kvm_createvm_count++;
 		kvm_active_vms++;
@@ -3921,7 +3921,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	}
 	created = kvm_createvm_count;
 	active = kvm_active_vms;
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL);
 	if (!env)
@@ -0,0 +1,137 @@
From: Junaid Shahid <junaids@google.com>
Date: Thu, 3 Jan 2019 16:22:21 -0800
Subject: kvm: x86: Do not release the page inside mmu_set_spte()

commit 43fdcda96e2550c6d1c46fb8a78801aa2f7276ed upstream.

Release the page at the call-site where it was originally acquired.
This makes the exit code cleaner for most call sites, since they
no longer need to duplicate code between the success path and the
failure label.

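The ownership rule being applied is easiest to see in a small, self-contained
sketch (hypothetical helpers standing in for pfn acquisition and spte
installation, not the KVM functions):

#include <stdio.h>
#include <stdlib.h>

static int *acquire_page(void) { return malloc(sizeof(int)); }
static int map_page(int *page) { return page ? 0 : -1; }
static void release_page(int *page) { free(page); }

/* Before the patch, the mapping helper itself released the reference on
 * some paths, so callers needed different cleanup on success and failure.
 * After the patch, the function that acquired the reference releases it,
 * on every exit path, exactly once. */
static int handle_fault(void)
{
	int *page = acquire_page();
	int r = -1;

	if (!page)
		return r;
	r = map_page(page);
	release_page(page);	/* single cleanup path for success and failure */
	return r;
}

int main(void)
{
	printf("handle_fault() -> %d\n", handle_fault());
	return 0;
}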
Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/mmu.c         | 18 +++++++-----------
 arch/x86/kvm/paging_tmpl.h |  8 +++-----
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c9d4e02bd73a..7dc18fb42168 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3001,8 +3001,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 		}
 	}
 
-	kvm_release_pfn_clean(pfn);
-
 	return ret;
 }
 
@@ -3037,9 +3035,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 	if (ret <= 0)
 		return -1;
 
-	for (i = 0; i < ret; i++, gfn++, start++)
+	for (i = 0; i < ret; i++, gfn++, start++) {
 		mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
 			     page_to_pfn(pages[i]), true, true);
+		put_page(pages[i]);
+	}
 
 	return 0;
 }
@@ -3445,6 +3445,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
 		return r;
 
+	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
@@ -3453,14 +3454,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return RET_PF_RETRY;
+	return r;
 }
 
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -4082,6 +4080,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
 		return r;
 
+	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
@@ -4090,14 +4089,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return RET_PF_RETRY;
+	return r;
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 14ffd973df54..569c55dae3fa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -522,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
 		     true, true);
 
+	kvm_release_pfn_clean(pfn);
 	return true;
 }
 
@@ -673,7 +674,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	return ret;
 
 out_gpte_changed:
-	kvm_release_pfn_clean(pfn);
 	return RET_PF_RETRY;
 }
 
@@ -821,6 +821,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		walker.pte_access &= ~ACC_EXEC_MASK;
 	}
 
+	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
@@ -834,14 +835,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 			 level, pfn, map_writable, prefault);
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return RET_PF_RETRY;
+	return r;
 }
 
 static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
@@ -0,0 +1,173 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 24 Jun 2019 13:06:21 +0200
Subject: KVM: x86: make FNAME(fetch) and __direct_map more similar

commit 3fcf2d1bdeb6a513523cb2c77012a6b047aa859c upstream.

These two functions are basically doing the same thing through
kvm_mmu_get_page, link_shadow_page and mmu_set_spte; yet, for historical
reasons, their code looks very different. This patch tries to take the
best of each and make them very similar, so that it is easy to understand
changes that apply to both of them.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/mmu.c         | 53 ++++++++++++++++++--------------------
 arch/x86/kvm/paging_tmpl.h | 30 ++++++++++-----------
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7dc18fb42168..42a7120323bb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3087,40 +3087,39 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-			int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+			int map_writable, int level, kvm_pfn_t pfn,
+			bool prefault)
 {
-	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
-	int emulate = 0;
-	gfn_t pseudo_gfn;
+	int ret;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	gfn_t base_gfn = gfn;
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return 0;
+		return RET_PF_RETRY;
 
-	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
-		if (iterator.level == level) {
-			emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
-					       write, level, gfn, pfn, prefault,
-					       map_writable);
-			direct_pte_prefetch(vcpu, iterator.sptep);
-			++vcpu->stat.pf_fixed;
+	for_each_shadow_entry(vcpu, gpa, it) {
+		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+		if (it.level == level)
 			break;
-		}
 
-		drop_large_spte(vcpu, iterator.sptep);
-		if (!is_shadow_present_pte(*iterator.sptep)) {
-			u64 base_addr = iterator.addr;
+		drop_large_spte(vcpu, it.sptep);
+		if (!is_shadow_present_pte(*it.sptep)) {
+			sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+					      it.level - 1, true, ACC_ALL);
 
-			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
-			pseudo_gfn = base_addr >> PAGE_SHIFT;
-			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
-					      iterator.level - 1, 1, ACC_ALL);
-
-			link_shadow_page(vcpu, iterator.sptep, sp);
+			link_shadow_page(vcpu, it.sptep, sp);
 		}
 	}
-	return emulate;
+
+	ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+			   write, level, base_gfn, pfn, prefault,
+			   map_writable);
+	direct_pte_prefetch(vcpu, it.sptep);
+	++vcpu->stat.pf_fixed;
+	return ret;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -3453,8 +3452,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4088,8 +4086,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 569c55dae3fa..eb95d3672acd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -602,6 +602,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	struct kvm_shadow_walk_iterator it;
 	unsigned direct_access, access = gw->pt_access;
 	int top_level, ret;
+	gfn_t base_gfn;
 
 	direct_access = gw->pte_access;
 
@@ -646,31 +647,29 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(vcpu, it.sptep, sp);
 	}
 
-	for (;
-	     shadow_walk_okay(&it) && it.level > hlevel;
-	     shadow_walk_next(&it)) {
-		gfn_t direct_gfn;
+	base_gfn = gw->gfn;
 
+	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
 		clear_sp_write_flooding_count(it.sptep);
+		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+		if (it.level == hlevel)
+			break;
+
 		validate_direct_spte(vcpu, it.sptep, direct_access);
 
 		drop_large_spte(vcpu, it.sptep);
 
-		if (is_shadow_present_pte(*it.sptep))
-			continue;
-
-		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
-
-		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
-				      true, direct_access);
-		link_shadow_page(vcpu, it.sptep, sp);
+		if (!is_shadow_present_pte(*it.sptep)) {
+			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+					      it.level - 1, true, direct_access);
+			link_shadow_page(vcpu, it.sptep, sp);
+		}
 	}
 
-	clear_sp_write_flooding_count(it.sptep);
 	ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
-			   it.level, gw->gfn, pfn, prefault, map_writable);
+			   it.level, base_gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
+	++vcpu->stat.pf_fixed;
 	return ret;
 
 out_gpte_changed:
@@ -833,7 +832,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
 			 level, pfn, map_writable, prefault);
-	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
@@ -0,0 +1,74 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 23 Jun 2019 19:15:49 +0200
Subject: KVM: x86: remove now unneeded hugepage gfn adjustment

commit d679b32611c0102ce33b9e1a4e4b94854ed1812a upstream.

After the previous patch, the low bits of the gfn are masked in
both FNAME(fetch) and __direct_map, so we do not need to clear them
in transparent_hugepage_adjust.

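To see what the masking does, here is a small, runnable sketch
(pages_per_hpage() is a simplified stand-in for KVM's KVM_PAGES_PER_HPAGE
macro): clearing the low bits aligns a gfn down to the start of its
huge-page frame.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t gfn_t;

/* 4 KiB pages per mapping at a given level:
 * level 1 = 4 KiB, level 2 = 2 MiB, level 3 = 1 GiB. */
static gfn_t pages_per_hpage(int level)
{
	return 1ULL << ((level - 1) * 9);
}

int main(void)
{
	gfn_t gfn = 0x12345;
	int level = 2;	/* 2 MiB mapping */

	/* The same expression both fault paths now rely on: */
	gfn_t base_gfn = gfn & ~(pages_per_hpage(level) - 1);

	/* Prints: gfn 0x12345 -> base gfn 0x12200 (low 9 bits cleared) */
	printf("gfn 0x%llx -> base gfn 0x%llx (low 9 bits cleared)\n",
	       (unsigned long long)gfn, (unsigned long long)base_gfn);
	return 0;
}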
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/mmu.c         | 9 +++------
 arch/x86/kvm/paging_tmpl.h | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 42a7120323bb..96803f996819 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3155,11 +3155,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-					gfn_t *gfnp, kvm_pfn_t *pfnp,
+					gfn_t gfn, kvm_pfn_t *pfnp,
 					int *levelp)
 {
 	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *gfnp;
 	int level = *levelp;
 
 	/*
@@ -3186,8 +3185,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 		mask = KVM_PAGES_PER_HPAGE(level) - 1;
 		VM_BUG_ON((gfn & mask) != (pfn & mask));
 		if (pfn & mask) {
-			gfn &= ~mask;
-			*gfnp = gfn;
 			kvm_release_pfn_clean(pfn);
 			pfn &= ~mask;
 			kvm_get_pfn(pfn);
@@ -3451,7 +3448,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
-		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -4085,7 +4082,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
-		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index eb95d3672acd..4aab953f1d31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -829,7 +829,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (!force_pt_level)
-		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+		transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
 			 level, pfn, map_writable, prefault);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -0,0 +1,41 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 30 Jun 2019 08:36:21 -0400
Subject: KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON

commit e9f2a760b158551bfbef6db31d2cae45ab8072e5 upstream.

Note that in such a case it is quite likely that KVM will BUG_ON
in __pte_list_remove when the VM is closed. However, there is no
immediate risk of memory corruption in the host, so a WARN_ON is
enough and it lets you gather traces for debugging.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/mmu.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96803f996819..68fa10d890ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1027,10 +1027,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
-	if (sp->role.direct)
-		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
-	else
+	if (!sp->role.direct) {
 		sp->gfns[index] = gfn;
+		return;
+	}
+
+	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+		pr_err_ratelimited("gfn mismatch under direct page %llx "
+				   "(expected %llx, got %llx)\n",
+				   sp->gfn,
+				   kvm_mmu_page_get_gfn(sp, index), gfn);
 }
 
 /*
@@ -0,0 +1,148 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Jul 2019 06:22:57 -0400
Subject: KVM: x86: add tracepoints around __direct_map and FNAME(fetch)

commit 335e192a3fa415e1202c8b9ecdaaecd643f823cc upstream.

These are useful in debugging shadow paging.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/mmu.c         | 13 ++++-----
 arch/x86/kvm/mmutrace.h    | 59 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h |  2 ++
 3 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 68fa10d890ee..7f9be921df7c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -140,9 +140,6 @@ module_param(dbg, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
@@ -261,9 +258,13 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 {
 	BUG_ON((mmio_mask & mmio_value) != mmio_value);
@@ -2992,10 +2993,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 		ret = RET_PF_EMULATE;
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
-		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
-		 *sptep, sptep);
+	trace_kvm_mmu_set_spte(level, gfn, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
 
@@ -3106,6 +3104,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return RET_PF_RETRY;
 
+	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c73bf4e4988c..918b0d5bf272 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -325,6 +325,65 @@ TRACE_EVENT(
 		__entry->kvm_gen == __entry->spte_gen
 	)
 );
+
+TRACE_EVENT(
+	kvm_mmu_set_spte,
+	TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+	TP_ARGS(level, gfn, sptep),
+
+	TP_STRUCT__entry(
+		__field(u64, gfn)
+		__field(u64, spte)
+		__field(u64, sptep)
+		__field(u8, level)
+		/* These depend on page entry type, so compute them now. */
+		__field(bool, r)
+		__field(bool, x)
+		__field(u8, u)
+	),
+
+	TP_fast_assign(
+		__entry->gfn = gfn;
+		__entry->spte = *sptep;
+		__entry->sptep = virt_to_phys(sptep);
+		__entry->level = level;
+		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+		__entry->x = is_executable_pte(__entry->spte);
+		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+	),
+
+	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+		  __entry->gfn, __entry->spte,
+		  __entry->r ? "r" : "-",
+		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+		  __entry->x ? "x" : "-",
+		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+		  __entry->level, __entry->sptep
+	)
+);
+
+TRACE_EVENT(
+	kvm_mmu_spte_requested,
+	TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+	TP_ARGS(addr, level, pfn),
+
+	TP_STRUCT__entry(
+		__field(u64, gfn)
+		__field(u64, pfn)
+		__field(u8, level)
+	),
+
+	TP_fast_assign(
+		__entry->gfn = addr >> PAGE_SHIFT;
+		__entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+		__entry->level = level;
+	),
+
+	TP_printk("gfn %llx pfn %llx level %d",
+		  __entry->gfn, __entry->pfn, __entry->level
+	)
+);
+
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4aab953f1d31..3b022b08b577 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -649,6 +649,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	base_gfn = gw->gfn;
 
+	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
 	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
 		clear_sp_write_flooding_count(it.sptep);
 		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
@@ -0,0 +1,101 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 30 Sep 2019 18:48:44 +0200
Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry

commit 833b45de69a6016c4b0cebe6765d526a31a81580 upstream.

The largepages debugfs entry is incremented/decremented as shadow
pages are created or destroyed. Clearing it will result in an
underflow, which is harmless to KVM but ugly (and could be
misinterpreted by tools that use debugfs information), so make
this particular statistic read-only.

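As a hedged aside on the mechanism (a generic debugfs sketch, not the patch
itself): the mode passed when a debugfs file is created is what decides
whether user space may write — and therefore clear — the value.

#include <linux/debugfs.h>

/* Hypothetical counter and directory, for illustration only. 0644 exposes
 * a writable attribute that tools can clear by writing 0; 0444 makes it
 * read-only, which is what the patch selects for "largepages". */
static u64 example_counter;
static struct dentry *example_dir;

static void example_debugfs_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	debugfs_create_u64("writable_stat", 0644, example_dir, &example_counter);
	debugfs_create_u64("readonly_stat", 0444, example_dir, &example_counter);
}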
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: drop powerpc changes and the Cc to kvm-ppc]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 arch/x86/kvm/x86.c       |  6 +++---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 10 +++++++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c085b895e6e..2714c1a0e59f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #endif
 
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
 
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -205,7 +205,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages) },
+	{ "largepages", VM_STAT(lpages, .mode = 0444) },
 	{ "max_mmu_page_hash_collisions",
 		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5246a480d15a..553a3115a735 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1034,6 +1034,7 @@ enum kvm_stat_kind {
 
 struct kvm_stat_data {
 	int offset;
+	int mode;
 	struct kvm *kvm;
 };
 
@@ -1041,6 +1042,7 @@ struct kvm_stats_debugfs_item {
 	const char *name;
 	int offset;
 	enum kvm_stat_kind kind;
+	int mode;
 };
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6a8fe26198b9..5482949b452c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -616,8 +616,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 
 		stat_data->kvm = kvm;
 		stat_data->offset = p->offset;
+		stat_data->mode = p->mode ? p->mode : 0644;
 		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-		debugfs_create_file(p->name, 0644, kvm->debugfs_dentry,
+		debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
 				    stat_data, stat_fops_per_vm[p->kind]);
 	}
 	return 0;
@@ -3714,7 +3715,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
 	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
 		return -ENOENT;
 
-	if (simple_attr_open(inode, file, get, set, fmt)) {
+	if (simple_attr_open(inode, file, get,
+			     stat_data->mode & S_IWUGO ? set : NULL,
+			     fmt)) {
 		kvm_put_kvm(stat_data->kvm);
 		return -ENOMEM;
 	}
@@ -3962,7 +3965,8 @@ static void kvm_init_debug(void)
 
 	kvm_debugfs_num_entries = 0;
 	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-		debugfs_create_file(p->name, 0644, kvm_debugfs_dir,
+		int mode = p->mode ? p->mode : 0644;
+		debugfs_create_file(p->name, mode, kvm_debugfs_dir,
 				    (void *)(long)p->offset,
 				    stat_fops[p->kind]);
 	}
debian/patches/bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch (new, vendored, 280 lines)
@@ -0,0 +1,280 @@
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Fri, 11 Oct 2019 12:40:12 +0200
Subject: x86: Add ITLB_MULTIHIT bug infrastructure

Some processors may incur a machine check error, possibly resulting in an
unrecoverable CPU hang, when an instruction fetch encounters a TLB
multi-hit in the instruction TLB. This can occur when the page size is
changed along with either the physical address or cache type [1].

This issue affects both bare-metal x86 page tables and EPT.

It can be mitigated by either eliminating the use of large pages or by
using careful TLB invalidations when changing the page size in the page
tables.

Just like Spectre, Meltdown, L1TF and MDS, a new bit has been allocated in
MSR_IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) and will be set on CPUs which
are mitigated against this issue.

[1] For example, please refer to erratum SKL002 in "6th Generation
Intel Processor Family Specification Update"
https://www.intel.com/content/www/us/en/products/docs/processors/core/desktop-6th-gen-core-family-spec-update.html
https://www.google.com/search?q=site:intel.com+SKL002

There are many other affected processors besides Skylake, and the erratum
referred to above does not fully disclose the issue or its impact, either
on Skylake or across all the affected CPUs.

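Before the diff, here is a hedged, stand-alone sketch of the detection
logic this patch adds (the macro value matches the patch;
cpu_has_itlb_multihit() is a hypothetical condensation of the check in
cpu_set_bug_bits(), not a kernel function):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARCH_CAP_PSCHANGE_MC_NO (1ULL << 6)	/* bit 6, as in the patch */

/* A CPU is flagged with X86_BUG_ITLB_MULTIHIT only if it is neither
 * whitelisted (NO_ITLB_MULTIHIT) nor reports PSCHANGE_MC_NO in
 * MSR_IA32_ARCH_CAPABILITIES. */
static bool cpu_has_itlb_multihit(bool whitelisted, uint64_t ia32_cap)
{
	return !whitelisted && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO);
}

int main(void)
{
	/* e.g. an older, non-whitelisted core with bit 6 clear: affected */
	printf("affected: %d\n", cpu_has_itlb_multihit(false, 0));
	/* a fixed CPU advertising PSCHANGE_MC_NO: not affected */
	printf("affected: %d\n",
	       cpu_has_itlb_multihit(false, ARCH_CAP_PSCHANGE_MC_NO));
	return 0;
}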
Signed-off-by: Vineela Tummalapalli <vineela.tummalapalli@intel.com>
Co-developed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19:
 - No support for X86_VENDOR_HYGON, ATOM_AIRMONT_NP
 - Adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 .../ABI/testing/sysfs-devices-system-cpu |  1 +
 arch/x86/include/asm/cpufeatures.h       |  1 +
 arch/x86/include/asm/msr-index.h         |  7 ++
 arch/x86/kernel/cpu/bugs.c               | 13 ++++
 arch/x86/kernel/cpu/common.c             | 67 ++++++++++---------
 drivers/base/cpu.c                       |  8 +++
 include/linux/cpu.h                      |  2 +
 7 files changed, 68 insertions(+), 31 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 8718d4ad227b..a0edcdc7c0b8 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -478,6 +478,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 		/sys/devices/system/cpu/vulnerabilities/l1tf
 		/sys/devices/system/cpu/vulnerabilities/mds
+		/sys/devices/system/cpu/vulnerabilities/itlb_multihit
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 759f0a176612..ccad4f183400 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -389,5 +389,6 @@
 #define X86_BUG_MDS		X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
 #define X86_BUG_MSBDS_ONLY	X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
 #define X86_BUG_SWAPGS		X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_ITLB_MULTIHIT	X86_BUG(22) /* CPU may incur MCE during certain page attribute changes */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a1d22e4428f6..f58e6921cbf7 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -84,6 +84,13 @@
 						 * Microarchitectural Data
 						 * Sampling (MDS) vulnerabilities.
 						 */
+#define ARCH_CAP_PSCHANGE_MC_NO		BIT(6)	/*
+						 * The processor is not susceptible to a
+						 * machine check error due to modifying the
+						 * code page size along with either the
+						 * physical address or cache type
+						 * without TLB invalidation.
+						 */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ee7d17611ead..60e47e492c2f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1281,6 +1281,11 @@ static ssize_t l1tf_show_state(char *buf)
 }
 #endif
 
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	return sprintf(buf, "Processor vulnerable\n");
+}
+
 static ssize_t mds_show_state(char *buf)
 {
 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
@@ -1366,6 +1371,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_MDS:
 		return mds_show_state(buf);
 
+	case X86_BUG_ITLB_MULTIHIT:
+		return itlb_multihit_show_state(buf);
+
 	default:
 		break;
 	}
@@ -1402,4 +1410,9 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
 }
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b33fdfa0ff49..128808dccd2f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -946,13 +946,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 #endif
 }
 
-#define NO_SPECULATION	BIT(0)
-#define NO_MELTDOWN	BIT(1)
-#define NO_SSB		BIT(2)
-#define NO_L1TF		BIT(3)
-#define NO_MDS		BIT(4)
-#define MSBDS_ONLY	BIT(5)
-#define NO_SWAPGS	BIT(6)
+#define NO_SPECULATION		BIT(0)
+#define NO_MELTDOWN		BIT(1)
+#define NO_SSB			BIT(2)
+#define NO_L1TF			BIT(3)
+#define NO_MDS			BIT(4)
+#define MSBDS_ONLY		BIT(5)
+#define NO_SWAPGS		BIT(6)
+#define NO_ITLB_MULTIHIT	BIT(7)
 
 #define VULNWL(_vendor, _family, _model, _whitelist)	\
 	{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -970,26 +971,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	VULNWL(NSC,	5, X86_MODEL_ANY,	NO_SPECULATION),
 
 	/* Intel Family 6 */
-	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION),
-	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION),
-
-	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_SILVERMONT_X,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SILVERMONT_X,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	VULNWL_INTEL(CORE_YONAH,		NO_SSB),
 
-	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
-	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_MDS | NO_L1TF | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	/*
 	 * Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1000,13 +1001,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	 */
 
 	/* AMD Family 0xf - 0x12 */
-	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
-	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
 	{}
 };
 
@@ -1021,15 +1022,19 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
+	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+	/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+	if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+		setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
 	if (cpu_matches(NO_SPECULATION))
 		return;
 
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
 
-	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
 	if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
 	    !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 2fd6ca1021c2..c21e2aec5cbb 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -552,12 +552,19 @@ ssize_t __weak cpu_show_mds(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
 static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
 static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
+static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -566,6 +573,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_spec_store_bypass.attr,
 	&dev_attr_l1tf.attr,
 	&dev_attr_mds.attr,
+	&dev_attr_itlb_multihit.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 006f69f9277b..7bb824b0f30e 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,6 +59,8 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
 			     struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_mds(struct device *dev,
 			    struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_itlb_multihit(struct device *dev,
+				      struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch (new, vendored, 464 lines)
@@ -0,0 +1,464 @@
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 11 Oct 2019 12:40:14 +0200
Subject: kvm: mmu: ITLB_MULTIHIT mitigation

With some Intel processors, putting the same virtual address in the TLB
as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit
and cause the processor to issue a machine check. Unfortunately, if EPT
page tables use huge pages, it is possible for a malicious guest to cause
this situation.

This patch adds a knob to mark huge pages as non-executable. When the
nx_huge_pages parameter is enabled (and we are using EPT), all huge pages
are marked as NX. If the guest attempts to execute in one of those pages,
the page is broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then
the host is in control of TLB flushes and the problematic situation cannot
happen. With nested EPT, again the nested guest can cause problems, so we
treat shadow and direct EPT the same.

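To make the behaviour concrete, here is a hedged sketch of the decision the
mitigation boils down to (a simplification for illustration, not the KVM
fault-handler code, which also accounts the splits and can later recover
them):

/* Sketch: executable mappings are forced down to 4 KiB pages, so the iTLB
 * can never hold the same guest text as both a 4 KiB and a 2 MiB entry. */
static int pick_mapping_level(int requested_level, bool exec_fault,
			      bool nx_huge_pages)
{
	if (nx_huge_pages && exec_fault)
		return 1;		/* split: map a single 4 KiB page */
	return requested_level;		/* data access may keep the huge page */
}

In practice the knob is set on the kernel command line
(kvm.nx_huge_pages=force|off|auto, documented in the first hunk below) and,
because module_param_cb() registers it with mode 0644, it should also be
adjustable at runtime via /sys/module/kvm/parameters/nx_huge_pages.
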
Signed-off-by: Junaid Shahid <junaids@google.com>
|
||||
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
|
||||
[bwh: Backported to 4.19:
|
||||
- Use kvm_mmu_invalidate_zap_all_pages() instead of kvm_mmu_zap_all_fast()
|
||||
- Adjust context]
|
||||
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
||||
---
|
||||
.../admin-guide/kernel-parameters.txt | 11 ++
|
||||
arch/x86/include/asm/kvm_host.h | 2 +
|
||||
arch/x86/kernel/cpu/bugs.c | 13 +-
|
||||
arch/x86/kvm/mmu.c | 135 +++++++++++++++++-
|
||||
arch/x86/kvm/paging_tmpl.h | 29 +++-
|
||||
arch/x86/kvm/x86.c | 1 +
|
||||
6 files changed, 178 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 16607b178b47..b2c1a5c63ab3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1956,6 +1956,17 @@
KVM MMU at runtime.
Default is 0 (off)

+ kvm.nx_huge_pages=
+ [KVM] Controls the sw workaround for bug
+ X86_BUG_ITLB_MULTIHIT.
+ force : Always deploy workaround.
+ off : Default. Never deploy workaround.
+ auto : Deploy workaround based on presence of
+ X86_BUG_ITLB_MULTIHIT.
+
+ If the sw workaround is enabled for the host, guests
+ need not enable it for nested guests.
+
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)
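
The tri-state documented here collapses to a boolean at parse time; the handler added to mmu.c below implements it with sysfs_streq(). As a standalone restatement of just those semantics (illustrative only; resolve_nx_huge_pages and cpu_has_bug are hypothetical names standing in for set_nx_huge_pages() and boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT)):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical helper mirroring the force/off/auto semantics above. */
    static bool resolve_nx_huge_pages(const char *val, bool cpu_has_bug)
    {
            if (strcmp(val, "force") == 0)
                    return true;    /* always deploy the workaround */
            if (strcmp(val, "off") == 0)
                    return false;   /* never deploy it */
            return cpu_has_bug;     /* "auto": only on affected CPUs */
    }

    int main(void)
    {
            printf("auto, affected CPU   -> %d\n", resolve_nx_huge_pages("auto", true));
            printf("auto, unaffected CPU -> %d\n", resolve_nx_huge_pages("auto", false));
            printf("force, unaffected    -> %d\n", resolve_nx_huge_pages("force", false));
            return 0;
    }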
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 90dccb5c79d9..59b44445ed59 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -282,6 +282,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */

/*
* The following two entries are used to key the shadow page in the
@@ -890,6 +891,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
+ ulong nx_lpage_splits;
ulong max_mmu_page_hash_collisions;
};

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 60e47e492c2f..1e764992fa64 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1119,6 +1119,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}

+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt

@@ -1274,17 +1277,25 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
-#endif
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
+#endif

static ssize_t mds_show_state(char *buf)
{

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7f9be921df7c..19c3dc9b05cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,6 +49,20 @@
#include <asm/kvm_page_track.h>
#include "trace.h"

+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -284,6 +298,11 @@ static inline bool spte_ad_enabled(u64 spte)
return !(spte & shadow_acc_track_value);
}

+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1096,6 +1115,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1113,6 +1141,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}

+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2665,6 +2699,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_reload_remote_mmus(kvm);
}

+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return ret;
}
@@ -2873,6 +2910,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);

+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -3091,9 +3133,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}

+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
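
The "next 9 bits" arithmetic in the comment above is easiest to see with concrete numbers. A minimal userspace sketch (illustrative only, not part of the patch; it assumes 4 KiB base pages, so KVM_PAGES_PER_HPAGE(2) is 512 and KVM_PAGES_PER_HPAGE(1) is 1):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Demote a 2 MiB mapping (level 2) one level, as
             * disallowed_hugepage_adjust() does. */
            uint64_t pages_per_hpage_l2 = 512;  /* 2 MiB / 4 KiB */
            uint64_t pages_per_hpage_l1 = 1;    /* 4 KiB / 4 KiB */
            uint64_t page_mask = pages_per_hpage_l2 - pages_per_hpage_l1;

            uint64_t gfn = 0x12345;  /* guest frame of the faulting page */
            uint64_t pfn = 0x98000;  /* huge-page-aligned host frame */

            /* Patch the next 9 address bits back into the pfn: the 2 MiB
             * mapping would have ignored them, the 4 KiB one must not. */
            pfn |= gfn & page_mask;

            printf("page_mask=0x%llx pfn=0x%llx\n",
                   (unsigned long long)page_mask,
                   (unsigned long long)pfn);
            return 0;
    }

Here page_mask comes out as 0x1ff and the pfn ends up as 0x98145, picking up exactly the 9 gfn bits a 2 MiB mapping would have ignored.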
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
- bool prefault)
+ bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3106,6 +3171,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,

trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3116,6 +3187,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);

link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}

@@ -3416,11 +3489,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();

+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3454,7 +3530,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -4048,6 +4125,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();

MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

@@ -4058,8 +4137,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;

- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -4088,7 +4168,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -5886,10 +5967,52 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}

+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;

+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT));
+
kvm_mmu_reset_all_pte_masks();

pte_list_desc_cache = kmem_cache_create("pte_list_desc",

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3b022b08b577..adf42dc8d38b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
int top_level, ret;
- gfn_t base_gfn;
+ gfn_t gfn, base_gfn;

direct_access = gw->pte_access;

@@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}

- base_gfn = gw->gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;

trace_kvm_mmu_spte_requested(addr, gw->level, pfn);

for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;

@@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}

@@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;

pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

@@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);

out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2714c1a0e59f..406a37aa61c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -206,6 +206,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
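
With the bugs.c and x86.c pieces above in place, the mitigation state becomes visible from userspace. A small sketch of reading it (illustrative only; the vulnerabilities directory is the standard sysfs location served by the attribute array at the top of this diff):

    #include <stdio.h>

    int main(void)
    {
            char line[128];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/itlb_multihit", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            /* Prints e.g. "KVM: Mitigation: Split huge pages" */
            if (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }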

131 debian/patches/bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch vendored Normal file

@@ -0,0 +1,131 @@
From: Junaid Shahid <junaids@google.com>
Date: Fri, 11 Oct 2019 12:40:15 +0200
Subject: kvm: Add helper function for creating VM worker threads

This adds a function to create a kernel thread associated with a given
VM. In particular, it ensures that the worker thread inherits the
priority and cgroups of the calling thread.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/kvm_host.h | 6 +++
virt/kvm/kvm_main.c | 84 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 553a3115a735..96207939d862 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1305,4 +1305,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */

+typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr);
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5482949b452c..77da54d334b2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
+#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/io.h>
@@ -4142,3 +4143,86 @@ void kvm_exit(void)
kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+ struct kvm *kvm;
+ struct task_struct *parent;
+ struct completion init_done;
+ kvm_vm_thread_fn_t thread_fn;
+ uintptr_t data;
+ int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+ /*
+ * The init_context is allocated on the stack of the parent thread, so
+ * we have to locally copy anything that is needed beyond initialization
+ */
+ struct kvm_vm_worker_thread_context *init_context = context;
+ struct kvm *kvm = init_context->kvm;
+ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+ uintptr_t data = init_context->data;
+ int err;
+
+ err = kthread_park(current);
+ /* kthread_park(current) is never supposed to return an error */
+ WARN_ON(err != 0);
+ if (err)
+ goto init_complete;
+
+ err = cgroup_attach_task_all(init_context->parent, current);
+ if (err) {
+ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+ __func__, err);
+ goto init_complete;
+ }
+
+ set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+ init_context->err = err;
+ complete(&init_context->init_done);
+ init_context = NULL;
+
+ if (err)
+ return err;
+
+ /* Wait to be woken up by the spawner before proceeding. */
+ kthread_parkme();
+
+ if (!kthread_should_stop())
+ err = thread_fn(kvm, data);
+
+ return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr)
+{
+ struct kvm_vm_worker_thread_context init_context = {};
+ struct task_struct *thread;
+
+ *thread_ptr = NULL;
+ init_context.kvm = kvm;
+ init_context.parent = current;
+ init_context.thread_fn = thread_fn;
+ init_context.data = data;
+ init_completion(&init_context.init_done);
+
+ thread = kthread_run(kvm_vm_worker_thread, &init_context,
+ "%s-%d", name, task_pid_nr(current));
+ if (IS_ERR(thread))
+ return PTR_ERR(thread);
+
+ /* kthread_run is never supposed to return NULL */
+ WARN_ON(thread == NULL);
+
+ wait_for_completion(&init_context.init_done);
+
+ if (!init_context.err)
+ *thread_ptr = thread;
+
+ return init_context.err;
+}
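
For readers less familiar with the kthread API this helper builds on: strip away the park/unpark handshake and the cgroup/priority inheritance, and what remains is the standard create/stop lifecycle. A minimal out-of-tree module sketch of that bare pattern (illustrative only, not part of the patch set):

    #include <linux/delay.h>
    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/module.h>

    static struct task_struct *demo_thread;

    /* Work loop; returns when kthread_stop() is called on the thread. */
    static int demo_fn(void *data)
    {
            while (!kthread_should_stop())
                    ssleep(1);
            return 0;
    }

    static int __init demo_init(void)
    {
            demo_thread = kthread_run(demo_fn, NULL, "demo-worker");
            return IS_ERR(demo_thread) ? PTR_ERR(demo_thread) : 0;
    }

    static void __exit demo_exit(void)
    {
            kthread_stop(demo_thread);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

The helper above adds two things on top of this: the new thread parks itself until the spawner has finished initialization, and it copies the spawner's nice level and cgroup membership before doing any work.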

368 debian/patches/bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch vendored Normal file

@@ -0,0 +1,368 @@
From: Junaid Shahid <junaids@google.com>
Date: Fri, 11 Oct 2019 12:40:16 +0200
Subject: kvm: x86: mmu: Recovery of shattered NX large pages

The page table pages corresponding to broken down large pages are
zapped in FIFO order, so that the large page can potentially be
recovered if it is no longer being used for execution. This removes
the performance penalty for walking deeper EPT page tables.

By default, one large page will last about one hour once the guest
reaches a steady state.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[bwh: Backported to 4.19: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
.../admin-guide/kernel-parameters.txt | 6 +
arch/x86/include/asm/kvm_host.h | 5 +
arch/x86/kvm/mmu.c | 129 ++++++++++++++++++
arch/x86/kvm/mmu.h | 4 +
arch/x86/kvm/x86.c | 11 ++
virt/kvm/kvm_main.c | 30 +++-
6 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b2c1a5c63ab3..efdc471ed0b9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1967,6 +1967,12 @@
If the sw workaround is enabled for the host, guests
need not enable it for nested guests.

+ kvm.nx_huge_pages_recovery_ratio=
+ [KVM] Controls how many 4KiB pages are periodically zapped
+ back to huge pages. 0 disables the recovery, otherwise if
+ the value is N KVM will zap 1/Nth of the 4KiB pages every
+ minute. The default is 60.
+
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59b44445ed59..efe3ba61fc23 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -281,6 +281,8 @@ struct kvm_rmap_head {
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct list_head lpage_disallowed_link;
+
bool unsync;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */

@@ -808,6 +810,7 @@ struct kvm_arch {
*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
+ struct list_head lpage_disallowed_mmu_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;

@@ -878,6 +881,8 @@ struct kvm_arch {
bool x2apic_broadcast_quirk_disabled;

bool guest_can_read_msr_platform_info;
+
+ struct task_struct *nx_lpage_recovery_thread;
};

struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19c3dc9b05cb..bafb9001ce94 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,6 +40,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/pat.h>
@@ -52,16 +53,26 @@
extern bool itlb_multihit_kvm_mitigation;

static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);

static struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};

+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");

/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -1121,6 +1132,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
return;

++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
sp->lpage_disallowed = true;
}

@@ -1145,6 +1158,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
--kvm->stat.nx_lpage_splits;
sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
}

static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
@@ -5999,6 +6013,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
idx = srcu_read_lock(&kvm->srcu);
kvm_mmu_invalidate_zap_all_pages(kvm);
srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -6079,3 +6095,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
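
A back-of-envelope check of the "about one hour" figure in the commit message: the worker below wakes once a minute, and kvm_recover_nx_lpages() zaps DIV_ROUND_UP(n, ratio) pages per wakeup, so at the default ratio of 60 a FIFO of n pages turns over in roughly 60 minutes. A standalone sketch (illustrative only; the steady-state count of 12000 is an assumed figure):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned long n = 12000;   /* assumed steady-state nx_lpage_splits */
            unsigned int ratio = 60;   /* nx_huge_pages_recovery_ratio default */
            unsigned long per_minute = DIV_ROUND_UP(n, ratio);

            /* FIFO residence time = list size / drain rate */
            printf("zap %lu pages/min; a split page survives ~%lu minutes\n",
                   per_minute, n / per_minute);
            return 0;
    }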
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 65892288bf51..f7b2de7b6382 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -216,4 +216,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 406a37aa61c7..1ecadf51f154 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8950,6 +8950,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);

@@ -8981,6 +8982,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}

+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
@@ -9082,6 +9088,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(x86_set_memory_region);

+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 77da54d334b2..7a0d86d52230 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -625,6 +625,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
return 0;
}

+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
static struct kvm *kvm_create_vm(unsigned long type)
{
int r, i;
@@ -679,10 +696,14 @@ static struct kvm *kvm_create_vm(unsigned long type)
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
if (!kvm->buses[i])
- goto out_err;
+ goto out_err_no_mmu_notifier;
}

r = kvm_init_mmu_notifier(kvm);
+ if (r)
+ goto out_err_no_mmu_notifier;
+
+ r = kvm_arch_post_init_vm(kvm);
if (r)
goto out_err;

@@ -695,6 +716,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
return kvm;

out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ if (kvm->mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -737,6 +763,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
mutex_unlock(&kvm_lock);
+ kvm_arch_pre_destroy_vm(kvm);
+
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) {
struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

debian/patches/series
@@ -258,6 +258,18 @@ bugfix/all/ALSA-usb-audio-Fix-a-stack-buffer-overflow-bug-in-check_input_term.pa
bugfix/all/vhost-make-sure-log_num-in_num.patch
bugfix/x86/x86-ptrace-fix-up-botched-merge-of-spectrev1-fix.patch
bugfix/all/KVM-coalesced_mmio-add-bounds-checking.patch
+bugfix/x86/itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch
+bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch
+bugfix/x86/itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch
+bugfix/x86/itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch
+bugfix/x86/itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch
+bugfix/x86/itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch
+bugfix/x86/itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch
+bugfix/x86/itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch
+bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch
+bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch
+bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch
+bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch

# ABI maintenance
debian/abi/powerpc-avoid-abi-change-for-disabling-tm.patch