From d9bd5941448fac3369ea59234201babc7c20a5dc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 20 Oct 2019 14:32:35 +0100 Subject: [PATCH] [x86] KVM: Add mitigation for Machine Check Error on Page Size Change (aka iTLB multi-hit, CVE-2018-12207) This is a backport of v6 of the "NX" patch set, and will probably require updates before release. --- debian/changelog | 19 + ...-kvm_mmu_page-member-to-save-8-bytes.patch | 54 ++ ...0002-kvm-Convert-kvm_lock-to-a-mutex.patch | 275 +++++++++++ ...release-the-page-inside-mmu_set_spte.patch | 137 ++++++ ...ME-fetch-and-__direct_map-more-simil.patch | 173 +++++++ ...now-unneeded-hugepage-gfn-adjustment.patch | 74 +++ ...vm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch | 41 ++ ...epoints-around-__direct_map-and-FNAM.patch | 148 ++++++ ...do-not-allow-clearing-largepages-deb.patch | 101 ++++ ...Add-ITLB_MULTIHIT-bug-infrastructure.patch | 280 +++++++++++ ...010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch | 464 ++++++++++++++++++ ...unction-for-creating-VM-worker-threa.patch | 131 +++++ ...Recovery-of-shattered-NX-large-pages.patch | 368 ++++++++++++++ debian/patches/series | 12 + 14 files changed, 2277 insertions(+) create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch create mode 100644 debian/patches/bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch diff --git a/debian/changelog b/debian/changelog index 13e18b429..49e892720 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,22 @@ +linux (4.19.67-2+deb10u2) UNRELEASED; urgency=medium + + * [x86] KVM: Add mitigation for Machine Check Error on Page Size Change + (aka iTLB multi-hit, CVE-2018-12207): + - KVM: x86: adjust kvm_mmu_page member to save 8 bytes + - kvm: Convert kvm_lock to a mutex + - kvm: x86: Do not release the page inside mmu_set_spte() + - KVM: x86: make FNAME(fetch) and __direct_map more similar + - KVM: x86: remove now unneeded hugepage gfn adjustment + - KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON + - KVM: x86: add tracepoints around __direct_map and FNAME(fetch) + - kvm: x86, powerpc: do not allow clearing largepages debugfs entry + - x86: Add ITLB_MULTIHIT bug infrastructure + - kvm: mmu: ITLB_MULTIHIT mitigation + - kvm: Add helper function for creating VM worker threads + - kvm: x86: mmu: Recovery of shattered 
NX large pages + + -- Ben Hutchings Sun, 20 Oct 2019 14:21:28 +0100 + linux (4.19.67-2+deb10u1) buster-security; urgency=high [ Romain Perier ] diff --git a/debian/patches/bugfix/x86/itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch b/debian/patches/bugfix/x86/itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch new file mode 100644 index 000000000..3a332cb2a --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch @@ -0,0 +1,54 @@ +From: Wei Yang +Date: Thu, 6 Sep 2018 05:58:16 +0800 +Subject: KVM: x86: adjust kvm_mmu_page member to save 8 bytes + +commit 3ff519f29d98ecdc1961d825d105d68711093b6b upstream. + +On a 64bits machine, struct is naturally aligned with 8 bytes. Since +kvm_mmu_page member *unsync* and *role* are less then 4 bytes, we can +rearrange the sequence to compace the struct. + +As the comment shows, *role* and *gfn* are used to key the shadow page. In +order to keep the comment valid, this patch moves the *unsync* up and +exchange the position of *role* and *gfn*. + +From /proc/slabinfo, it shows the size of kvm_mmu_page is 8 bytes less and +with one more object per slap after applying this patch. + + # name + kvm_mmu_page_header 0 0 168 24 + + kvm_mmu_page_header 0 0 160 25 + +Signed-off-by: Wei Yang +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +--- + arch/x86/include/asm/kvm_host.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 0d3f5cf3ff3e..90dccb5c79d9 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -281,18 +281,18 @@ struct kvm_rmap_head { + struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; ++ bool unsync; + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ +- gfn_t gfn; + union kvm_mmu_page_role role; ++ gfn_t gfn; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; +- bool unsync; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ diff --git a/debian/patches/bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch b/debian/patches/bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch new file mode 100644 index 000000000..d1f52e63d --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch @@ -0,0 +1,275 @@ +From: Junaid Shahid +Date: Thu, 3 Jan 2019 17:14:28 -0800 +Subject: kvm: Convert kvm_lock to a mutex + +commit 0d9ce162cf46c99628cc5da9510b959c7976735b upstream. + +It doesn't seem as if there is any particular need for kvm_lock to be a +spinlock, so convert the lock to a mutex so that sleepable functions (in +particular cond_resched()) can be called while holding it. 
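The conversion's payoff is easiest to see in a short sketch (illustrative only, not part of the patch): with kvm_lock as a mutex, code that walks vm_list may call sleepable functions such as cond_resched() between VMs, which the old spinlock forbade. The symbols used below (kvm_lock, vm_list, struct kvm) are the ones this patch touches; the helper name is made up for the example.

	#include <linux/kvm_host.h>	/* struct kvm, kvm_lock, vm_list */
	#include <linux/sched.h>	/* cond_resched() */

	/* Hypothetical helper: visit every VM, yielding the CPU as we go. */
	static void example_walk_all_vms(void)
	{
		struct kvm *kvm;

		mutex_lock(&kvm_lock);
		list_for_each_entry(kvm, &vm_list, vm_list) {
			/* ... per-VM work that may take a while ... */
			cond_resched();	/* fine under a mutex, a sleeping-in-atomic bug under a spinlock */
		}
		mutex_unlock(&kvm_lock);
	}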
+ +Signed-off-by: Junaid Shahid +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +--- + Documentation/virtual/kvm/locking.txt | 4 +--- + arch/s390/kvm/kvm-s390.c | 4 ++-- + arch/x86/kvm/mmu.c | 4 ++-- + arch/x86/kvm/x86.c | 14 ++++++------- + include/linux/kvm_host.h | 2 +- + virt/kvm/kvm_main.c | 30 +++++++++++++-------------- + 6 files changed, 28 insertions(+), 30 deletions(-) + +diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt +index 1bb8bcaf8497..635cd6eaf714 100644 +--- a/Documentation/virtual/kvm/locking.txt ++++ b/Documentation/virtual/kvm/locking.txt +@@ -15,8 +15,6 @@ KVM Lock Overview + + On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. +- + Everything else is a leaf: no other lock is taken inside the critical + sections. + +@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above. + ------------ + + Name: kvm_lock +-Type: spinlock_t ++Type: mutex + Arch: any + Protects: - vm_list + +diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c +index fac1d4eaa426..3c317bc6b799 100644 +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -2110,13 +2110,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + if (!kvm->arch.sca) + goto out_err; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + sca_offset += 16; + if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) + sca_offset = 0; + kvm->arch.sca = (struct bsca_block *) + ((char *) kvm->arch.sca + sca_offset); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + sprintf(debug_name, "kvm-%u", current->pid); + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 88940261fb53..c9d4e02bd73a 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -5819,7 +5819,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) + int nr_to_scan = sc->nr_to_scan; + unsigned long freed = 0; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + int idx; +@@ -5869,7 +5869,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) + break; + } + +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return freed; + } + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 6ae8a013af31..0c085b895e6e 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -6502,7 +6502,7 @@ static void kvm_hyperv_tsc_notifier(void) + struct kvm_vcpu *vcpu; + int cpu; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_make_mclock_inprogress_request(kvm); + +@@ -6528,7 +6528,7 @@ static void kvm_hyperv_tsc_notifier(void) + + spin_unlock(&ka->pvclock_gtod_sync_lock); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + } + #endif + +@@ -6586,17 +6586,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->cpu != freq->cpu) + continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); +- if (vcpu->cpu != smp_processor_id()) ++ if (vcpu->cpu != raw_smp_processor_id()) + send_ipi = 1; + } + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + 
if (freq->old < freq->new && send_ipi) { + /* +@@ -6722,12 +6722,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) + struct kvm_vcpu *vcpu; + int i; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + atomic_set(&kvm_guest_has_master_clock, 0); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + } + + static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index d42a36e4e6c2..5246a480d15a 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -141,7 +141,7 @@ static inline bool is_error_page(struct page *page) + + extern struct kmem_cache *kvm_vcpu_cache; + +-extern spinlock_t kvm_lock; ++extern struct mutex kvm_lock; + extern struct list_head vm_list; + + struct kvm_io_range { +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 4a584a575221..6a8fe26198b9 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + +-DEFINE_SPINLOCK(kvm_lock); ++DEFINE_MUTEX(kvm_lock); + static DEFINE_RAW_SPINLOCK(kvm_count_lock); + LIST_HEAD(vm_list); + +@@ -684,9 +684,9 @@ static struct kvm *kvm_create_vm(unsigned long type) + if (r) + goto out_err; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + preempt_notifier_inc(); + +@@ -732,9 +732,9 @@ static void kvm_destroy_vm(struct kvm *kvm) + kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); + kvm_destroy_vm_debugfs(kvm); + kvm_arch_sync_events(kvm); +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_del(&kvm->vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) { + struct kvm_io_bus *bus = kvm_get_bus(kvm, i); +@@ -3828,13 +3828,13 @@ static int vm_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3847,12 +3847,12 @@ static int vm_stat_clear(void *_offset, u64 val) + if (val) + return -EINVAL; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_clear_per_vm((void *)&stat_tmp, 0); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + return 0; + } +@@ -3867,13 +3867,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3886,12 +3886,12 @@ static int vcpu_stat_clear(void *_offset, u64 val) + if (val) + return -EINVAL; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + return 0; + } +@@ -3912,7 +3912,7 @@ static void kvm_uevent_notify_change(unsigned 
int type, struct kvm *kvm) + if (!kvm_dev.this_device || !kvm) + return; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + if (type == KVM_EVENT_CREATE_VM) { + kvm_createvm_count++; + kvm_active_vms++; +@@ -3921,7 +3921,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) + } + created = kvm_createvm_count; + active = kvm_active_vms; +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + env = kzalloc(sizeof(*env), GFP_KERNEL); + if (!env) diff --git a/debian/patches/bugfix/x86/itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch b/debian/patches/bugfix/x86/itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch new file mode 100644 index 000000000..51cb71d1f --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch @@ -0,0 +1,137 @@ +From: Junaid Shahid +Date: Thu, 3 Jan 2019 16:22:21 -0800 +Subject: kvm: x86: Do not release the page inside mmu_set_spte() + +commit 43fdcda96e2550c6d1c46fb8a78801aa2f7276ed upstream. + +Release the page at the call-site where it was originally acquired. +This makes the exit code cleaner for most call sites, since they +do not need to duplicate code between success and the failure +label. + +Signed-off-by: Junaid Shahid +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/mmu.c | 18 +++++++----------- + arch/x86/kvm/paging_tmpl.h | 8 +++----- + 2 files changed, 10 insertions(+), 16 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index c9d4e02bd73a..7dc18fb42168 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -3001,8 +3001,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + } + } + +- kvm_release_pfn_clean(pfn); +- + return ret; + } + +@@ -3037,9 +3035,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + if (ret <= 0) + return -1; + +- for (i = 0; i < ret; i++, gfn++, start++) ++ for (i = 0; i < ret; i++, gfn++, start++) { + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); ++ put_page(pages[i]); ++ } + + return 0; + } +@@ -3445,6 +3445,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; +@@ -3453,14 +3454,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; + + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, +@@ -4082,6 +4080,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; +@@ -4090,14 +4089,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- 
spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; + + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static void nonpaging_init_context(struct kvm_vcpu *vcpu, +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 14ffd973df54..569c55dae3fa 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -522,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + true, true); + ++ kvm_release_pfn_clean(pfn); + return true; + } + +@@ -673,7 +674,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + return ret; + + out_gpte_changed: +- kvm_release_pfn_clean(pfn); + return RET_PF_RETRY; + } + +@@ -821,6 +821,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + walker.pte_access &= ~ACC_EXEC_MASK; + } + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; +@@ -834,14 +835,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + level, pfn, map_writable, prefault); + ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; + + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/debian/patches/bugfix/x86/itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch b/debian/patches/bugfix/x86/itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch new file mode 100644 index 000000000..436fb76b1 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch @@ -0,0 +1,173 @@ +From: Paolo Bonzini +Date: Mon, 24 Jun 2019 13:06:21 +0200 +Subject: KVM: x86: make FNAME(fetch) and __direct_map more similar + +commit 3fcf2d1bdeb6a513523cb2c77012a6b047aa859c upstream. + +These two functions are basically doing the same thing through +kvm_mmu_get_page, link_shadow_page and mmu_set_spte; yet, for historical +reasons, their code looks very different. This patch tries to take the +best of each and make them very similar, so that it is easy to understand +changes that apply to both of them. 
+ +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/mmu.c | 53 ++++++++++++++++++-------------------- + arch/x86/kvm/paging_tmpl.h | 30 ++++++++++----------- + 2 files changed, 39 insertions(+), 44 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 7dc18fb42168..42a7120323bb 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -3087,40 +3087,39 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + __direct_pte_prefetch(vcpu, sp, sptep); + } + +-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, +- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) ++static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ++ int map_writable, int level, kvm_pfn_t pfn, ++ bool prefault) + { +- struct kvm_shadow_walk_iterator iterator; ++ struct kvm_shadow_walk_iterator it; + struct kvm_mmu_page *sp; +- int emulate = 0; +- gfn_t pseudo_gfn; ++ int ret; ++ gfn_t gfn = gpa >> PAGE_SHIFT; ++ gfn_t base_gfn = gfn; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) +- return 0; ++ return RET_PF_RETRY; + +- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { +- if (iterator.level == level) { +- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, +- write, level, gfn, pfn, prefault, +- map_writable); +- direct_pte_prefetch(vcpu, iterator.sptep); +- ++vcpu->stat.pf_fixed; ++ for_each_shadow_entry(vcpu, gpa, it) { ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == level) + break; +- } + +- drop_large_spte(vcpu, iterator.sptep); +- if (!is_shadow_present_pte(*iterator.sptep)) { +- u64 base_addr = iterator.addr; ++ drop_large_spte(vcpu, it.sptep); ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, ++ it.level - 1, true, ACC_ALL); + +- base_addr &= PT64_LVL_ADDR_MASK(iterator.level); +- pseudo_gfn = base_addr >> PAGE_SHIFT; +- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, +- iterator.level - 1, 1, ACC_ALL); +- +- link_shadow_page(vcpu, iterator.sptep, sp); ++ link_shadow_page(vcpu, it.sptep, sp); + } + } +- return emulate; ++ ++ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, ++ write, level, base_gfn, pfn, prefault, ++ map_writable); ++ direct_pte_prefetch(vcpu, it.sptep); ++ ++vcpu->stat.pf_fixed; ++ return ret; + } + + static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +@@ -3453,8 +3452,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- ++ r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +@@ -4088,8 +4086,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 569c55dae3fa..eb95d3672acd 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -602,6 +602,7 @@ static int 
FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; + int top_level, ret; ++ gfn_t base_gfn; + + direct_access = gw->pte_access; + +@@ -646,31 +647,29 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + link_shadow_page(vcpu, it.sptep, sp); + } + +- for (; +- shadow_walk_okay(&it) && it.level > hlevel; +- shadow_walk_next(&it)) { +- gfn_t direct_gfn; ++ base_gfn = gw->gfn; + ++ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { + clear_sp_write_flooding_count(it.sptep); ++ base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == hlevel) ++ break; ++ + validate_direct_spte(vcpu, it.sptep, direct_access); + + drop_large_spte(vcpu, it.sptep); + +- if (is_shadow_present_pte(*it.sptep)) +- continue; +- +- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); +- +- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, +- true, direct_access); +- link_shadow_page(vcpu, it.sptep, sp); ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, addr, ++ it.level - 1, true, direct_access); ++ link_shadow_page(vcpu, it.sptep, sp); ++ } + } + +- clear_sp_write_flooding_count(it.sptep); + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, +- it.level, gw->gfn, pfn, prefault, map_writable); ++ it.level, base_gfn, pfn, prefault, map_writable); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); +- ++ ++vcpu->stat.pf_fixed; + return ret; + + out_gpte_changed: +@@ -833,7 +832,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, + level, pfn, map_writable, prefault); +- ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + + out_unlock: diff --git a/debian/patches/bugfix/x86/itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch b/debian/patches/bugfix/x86/itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch new file mode 100644 index 000000000..37aebee2b --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch @@ -0,0 +1,74 @@ +From: Paolo Bonzini +Date: Sun, 23 Jun 2019 19:15:49 +0200 +Subject: KVM: x86: remove now unneeded hugepage gfn adjustment + +commit d679b32611c0102ce33b9e1a4e4b94854ed1812a upstream. + +After the previous patch, the low bits of the gfn are masked in +both FNAME(fetch) and __direct_map, so we do not need to clear them +in transparent_hugepage_adjust. 
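A concrete case may help (not part of the patch; the values are assumed for illustration): on x86, KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) is 512, since a 2 MiB page covers 512 frames of 4 KiB, so the masking both callers now perform reduces to:

	/* Assumed example: level = PT_DIRECTORY_LEVEL (2 MiB), gfn = 0x12345 */
	base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);	/* 0x12345 & ~0x1ff = 0x12200 */

which is the same low-bit clearing that transparent_hugepage_adjust no longer needs to repeat on the gfn.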
+ +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/mmu.c | 9 +++------ + arch/x86/kvm/paging_tmpl.h | 2 +- + 2 files changed, 4 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 42a7120323bb..96803f996819 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -3155,11 +3155,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) + } + + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, +- gfn_t *gfnp, kvm_pfn_t *pfnp, ++ gfn_t gfn, kvm_pfn_t *pfnp, + int *levelp) + { + kvm_pfn_t pfn = *pfnp; +- gfn_t gfn = *gfnp; + int level = *levelp; + + /* +@@ -3186,8 +3185,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { +- gfn &= ~mask; +- *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); +@@ -3451,7 +3448,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); +@@ -4085,7 +4082,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index eb95d3672acd..4aab953f1d31 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -829,7 +829,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (!force_pt_level) +- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); ++ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, + level, pfn, map_writable, prefault); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); diff --git a/debian/patches/bugfix/x86/itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch b/debian/patches/bugfix/x86/itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch new file mode 100644 index 000000000..58cd52ba5 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch @@ -0,0 +1,41 @@ +From: Paolo Bonzini +Date: Sun, 30 Jun 2019 08:36:21 -0400 +Subject: KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON + +commit e9f2a760b158551bfbef6db31d2cae45ab8072e5 upstream. + +Note that in such a case it is quite likely that KVM will BUG_ON +in __pte_list_remove when the VM is closed. However, there is no +immediate risk of memory corruption in the host so a WARN_ON is +enough and it lets you gather traces for debugging. 
+ +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/mmu.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 96803f996819..68fa10d890ee 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -1027,10 +1027,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) + + static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) + { +- if (sp->role.direct) +- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); +- else ++ if (!sp->role.direct) { + sp->gfns[index] = gfn; ++ return; ++ } ++ ++ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) ++ pr_err_ratelimited("gfn mismatch under direct page %llx " ++ "(expected %llx, got %llx)\n", ++ sp->gfn, ++ kvm_mmu_page_get_gfn(sp, index), gfn); + } + + /* diff --git a/debian/patches/bugfix/x86/itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch b/debian/patches/bugfix/x86/itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch new file mode 100644 index 000000000..ce11a4504 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch @@ -0,0 +1,148 @@ +From: Paolo Bonzini +Date: Mon, 1 Jul 2019 06:22:57 -0400 +Subject: KVM: x86: add tracepoints around __direct_map and FNAME(fetch) + +commit 335e192a3fa415e1202c8b9ecdaaecd643f823cc upstream. + +These are useful in debugging shadow paging. + +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/mmu.c | 13 ++++----- + arch/x86/kvm/mmutrace.h | 59 ++++++++++++++++++++++++++++++++++++++ + arch/x86/kvm/paging_tmpl.h | 2 ++ + 3 files changed, 67 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 68fa10d890ee..7f9be921df7c 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -140,9 +140,6 @@ module_param(dbg, bool, 0644); + + #include + +-#define CREATE_TRACE_POINTS +-#include "mmutrace.h" +- + #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +@@ -261,9 +258,13 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + + + static void mmu_spte_set(u64 *sptep, u64 spte); ++static bool is_executable_pte(u64 spte); + static union kvm_mmu_page_role + kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); + ++#define CREATE_TRACE_POINTS ++#include "mmutrace.h" ++ + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) + { + BUG_ON((mmio_mask & mmio_value) != mmio_value); +@@ -2992,10 +2993,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + ret = RET_PF_EMULATE; + + pgprintk("%s: setting spte %llx\n", __func__, *sptep); +- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", +- is_large_pte(*sptep)? "2MB" : "4kB", +- *sptep & PT_WRITABLE_MASK ? 
"RW" : "R", gfn, +- *sptep, sptep); ++ trace_kvm_mmu_set_spte(level, gfn, sptep); + if (!was_rmapped && is_large_pte(*sptep)) + ++vcpu->kvm->stat.lpages; + +@@ -3106,6 +3104,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return RET_PF_RETRY; + ++ trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) +diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h +index c73bf4e4988c..918b0d5bf272 100644 +--- a/arch/x86/kvm/mmutrace.h ++++ b/arch/x86/kvm/mmutrace.h +@@ -325,6 +325,65 @@ TRACE_EVENT( + __entry->kvm_gen == __entry->spte_gen + ) + ); ++ ++TRACE_EVENT( ++ kvm_mmu_set_spte, ++ TP_PROTO(int level, gfn_t gfn, u64 *sptep), ++ TP_ARGS(level, gfn, sptep), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, spte) ++ __field(u64, sptep) ++ __field(u8, level) ++ /* These depend on page entry type, so compute them now. */ ++ __field(bool, r) ++ __field(bool, x) ++ __field(u8, u) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = gfn; ++ __entry->spte = *sptep; ++ __entry->sptep = virt_to_phys(sptep); ++ __entry->level = level; ++ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); ++ __entry->x = is_executable_pte(__entry->spte); ++ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; ++ ), ++ ++ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", ++ __entry->gfn, __entry->spte, ++ __entry->r ? "r" : "-", ++ __entry->spte & PT_WRITABLE_MASK ? "w" : "-", ++ __entry->x ? "x" : "-", ++ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), ++ __entry->level, __entry->sptep ++ ) ++); ++ ++TRACE_EVENT( ++ kvm_mmu_spte_requested, ++ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), ++ TP_ARGS(addr, level, pfn), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, pfn) ++ __field(u8, level) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = addr >> PAGE_SHIFT; ++ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); ++ __entry->level = level; ++ ), ++ ++ TP_printk("gfn %llx pfn %llx level %d", ++ __entry->gfn, __entry->pfn, __entry->level ++ ) ++); ++ + #endif /* _TRACE_KVMMMU_H */ + + #undef TRACE_INCLUDE_PATH +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 4aab953f1d31..3b022b08b577 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -649,6 +649,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + + base_gfn = gw->gfn; + ++ trace_kvm_mmu_spte_requested(addr, gw->level, pfn); ++ + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { + clear_sp_write_flooding_count(it.sptep); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); diff --git a/debian/patches/bugfix/x86/itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch b/debian/patches/bugfix/x86/itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch new file mode 100644 index 000000000..186eef648 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch @@ -0,0 +1,101 @@ +From: Paolo Bonzini +Date: Mon, 30 Sep 2019 18:48:44 +0200 +Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry + +commit 833b45de69a6016c4b0cebe6765d526a31a81580 upstream. + +The largepages debugfs entry is incremented/decremented as shadow +pages are created or destroyed. 
Clearing it will result in an +underflow, which is harmless to KVM but ugly (and could be +misinterpreted by tools that use debugfs information), so make +this particular statistic read-only. + +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: drop powerpc changes and the Cc to kvm-ppc] +Signed-off-by: Ben Hutchings +--- + arch/x86/kvm/x86.c | 6 +++--- + include/linux/kvm_host.h | 2 ++ + virt/kvm/kvm_main.c | 10 +++++++--- + 3 files changed, 12 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 0c085b895e6e..2714c1a0e59f 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); + static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); + #endif + +-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU ++#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ ++#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ + + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +@@ -205,7 +205,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, +- { "largepages", VM_STAT(lpages) }, ++ { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, + { NULL } +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 5246a480d15a..553a3115a735 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1034,6 +1034,7 @@ enum kvm_stat_kind { + + struct kvm_stat_data { + int offset; ++ int mode; + struct kvm *kvm; + }; + +@@ -1041,6 +1042,7 @@ struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; ++ int mode; + }; + extern struct kvm_stats_debugfs_item debugfs_entries[]; + extern struct dentry *kvm_debugfs_dir; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 6a8fe26198b9..5482949b452c 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -616,8 +616,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) + + stat_data->kvm = kvm; + stat_data->offset = p->offset; ++ stat_data->mode = p->mode ? p->mode : 0644; + kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; +- debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, ++ debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, + stat_data, stat_fops_per_vm[p->kind]); + } + return 0; +@@ -3714,7 +3715,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, + if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + return -ENOENT; + +- if (simple_attr_open(inode, file, get, set, fmt)) { ++ if (simple_attr_open(inode, file, get, ++ stat_data->mode & S_IWUGO ? set : NULL, ++ fmt)) { + kvm_put_kvm(stat_data->kvm); + return -ENOMEM; + } +@@ -3962,7 +3965,8 @@ static void kvm_init_debug(void) + + kvm_debugfs_num_entries = 0; + for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { +- debugfs_create_file(p->name, 0644, kvm_debugfs_dir, ++ int mode = p->mode ? 
p->mode : 0644; ++ debugfs_create_file(p->name, mode, kvm_debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); + } diff --git a/debian/patches/bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch b/debian/patches/bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch new file mode 100644 index 000000000..0ff74e465 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch @@ -0,0 +1,280 @@ +From: Pawan Gupta +Date: Fri, 11 Oct 2019 12:40:12 +0200 +Subject: x86: Add ITLB_MULTIHIT bug infrastructure + +Some processors may incur a machine check error possibly +resulting in an unrecoverable cpu hang when an instruction fetch +encounters a TLB multi-hit in the instruction TLB. This can occur +when the page size is changed along with either the physical +address or cache type [1]. + +This issue affects both bare-metal x86 page tables and EPT. + +This can be mitigated by either eliminating the use of large +pages or by using careful TLB invalidations when changing the +page size in the page tables. + +Just like Spectre, Meltdown, L1TF and MDS, a new bit has been +allocated in MSR_IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) and will +be set on CPUs which are mitigated against this issue. + +[1] For example please refer to erratum SKL002 in "6th Generation +Intel Processor Family Specification Update" +https://www.intel.com/content/www/us/en/products/docs/processors/core/desktop-6th-gen-core-family-spec-update.html +https://www.google.com/search?q=site:intel.com+SKL002 + +There are a lot of other affected processors outside of Skylake and +that the erratum(referred above) does not fully disclose the issue +and the impact, both on Skylake and across all the affected CPUs. 
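Once this infrastructure is in place, exposure can be checked from userspace through the new sysfs entry added below (the reported string comes from itlb_multihit_show_state(), "Processor vulnerable" at this point in the series, refined by the later KVM mitigation patch). A minimal reader, illustrative only and assuming the path added in drivers/base/cpu.c:

	#include <stdio.h>

	int main(void)
	{
		char buf[128];
		FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/itlb_multihit", "r");

		if (!f) {
			puts("itlb_multihit: sysfs file not present (kernel without this patch?)");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("itlb_multihit: %s", buf);	/* kernel string already ends in '\n' */
		fclose(f);
		return 0;
	}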
+ +Signed-off-by: Vineela Tummalapalli +Co-developed-by: Pawan Gupta +Signed-off-by: Pawan Gupta +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: + - No support for X86_VENDOR_HYGON, ATOM_AIRMONT_NP + - Adjust context] +Signed-off-by: Ben Hutchings +--- + .../ABI/testing/sysfs-devices-system-cpu | 1 + + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 7 ++ + arch/x86/kernel/cpu/bugs.c | 13 ++++ + arch/x86/kernel/cpu/common.c | 67 ++++++++++--------- + drivers/base/cpu.c | 8 +++ + include/linux/cpu.h | 2 + + 7 files changed, 68 insertions(+), 31 deletions(-) + +diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu +index 8718d4ad227b..a0edcdc7c0b8 100644 +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -478,6 +478,7 @@ What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/l1tf + /sys/devices/system/cpu/vulnerabilities/mds ++ /sys/devices/system/cpu/vulnerabilities/itlb_multihit + Date: January 2018 + Contact: Linux kernel mailing list + Description: Information about CPU vulnerabilities +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 759f0a176612..ccad4f183400 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -389,5 +389,6 @@ + #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ + #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ + #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ ++#define X86_BUG_ITLB_MULTIHIT X86_BUG(22) /* CPU may incur MCE during certain page attribute changes */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index a1d22e4428f6..f58e6921cbf7 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -84,6 +84,13 @@ + * Microarchitectural Data + * Sampling (MDS) vulnerabilities. + */ ++#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* ++ * The processor is not susceptible to a ++ * machine check error due to modifying the ++ * code page size along with either the ++ * physical address or cache type ++ * without TLB invalidation. 
++ */ + + #define MSR_IA32_FLUSH_CMD 0x0000010b + #define L1D_FLUSH BIT(0) /* +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index ee7d17611ead..60e47e492c2f 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1281,6 +1281,11 @@ static ssize_t l1tf_show_state(char *buf) + } + #endif + ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ return sprintf(buf, "Processor vulnerable\n"); ++} ++ + static ssize_t mds_show_state(char *buf) + { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { +@@ -1366,6 +1371,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + case X86_BUG_MDS: + return mds_show_state(buf); + ++ case X86_BUG_ITLB_MULTIHIT: ++ return itlb_multihit_show_state(buf); ++ + default: + break; + } +@@ -1402,4 +1410,9 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu + { + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); + } ++ ++ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); ++} + #endif +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index b33fdfa0ff49..128808dccd2f 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -946,13 +946,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) + #endif + } + +-#define NO_SPECULATION BIT(0) +-#define NO_MELTDOWN BIT(1) +-#define NO_SSB BIT(2) +-#define NO_L1TF BIT(3) +-#define NO_MDS BIT(4) +-#define MSBDS_ONLY BIT(5) +-#define NO_SWAPGS BIT(6) ++#define NO_SPECULATION BIT(0) ++#define NO_MELTDOWN BIT(1) ++#define NO_SSB BIT(2) ++#define NO_L1TF BIT(3) ++#define NO_MDS BIT(4) ++#define MSBDS_ONLY BIT(5) ++#define NO_SWAPGS BIT(6) ++#define NO_ITLB_MULTIHIT BIT(7) + + #define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +@@ -970,26 +971,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ +- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), +- +- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ ++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | 
MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + +- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + +- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously +@@ -1000,13 +1001,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + */ + + /* AMD Family 0xf - 0x12 */ +- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ +- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + {} + }; + +@@ -1021,15 +1022,19 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + { + u64 ia32_cap = 0; + ++ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); ++ ++ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ ++ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) ++ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); ++ + if (cpu_matches(NO_SPECULATION)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + +- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) +- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); +- + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c +index 2fd6ca1021c2..c21e2aec5cbb 100644 +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -552,12 +552,19 @@ ssize_t __weak cpu_show_mds(struct device *dev, + return sprintf(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); + static DEVICE_ATTR(l1tf, 0444, 
cpu_show_l1tf, NULL); + static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); ++static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, +@@ -566,6 +573,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_spec_store_bypass.attr, + &dev_attr_l1tf.attr, + &dev_attr_mds.attr, ++ &dev_attr_itlb_multihit.attr, + NULL + }; + +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index 006f69f9277b..7bb824b0f30e 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -59,6 +59,8 @@ extern ssize_t cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf); + extern ssize_t cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf); + + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch b/debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch new file mode 100644 index 000000000..62959cf6c --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch @@ -0,0 +1,464 @@ +From: Paolo Bonzini +Date: Fri, 11 Oct 2019 12:40:14 +0200 +Subject: kvm: mmu: ITLB_MULTIHIT mitigation + +With some Intel processors, putting the same virtual address in the TLB +as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit +and cause the processor to issue a machine check. Unfortunately if EPT +page tables use huge pages, it possible for a malicious guest to cause +this situation. + +This patch adds a knob to mark huge pages as non-executable. When the +nx_huge_pages parameter is enabled (and we are using EPT), all huge pages +are marked as NX. If the guest attempts to execute in one of those pages, +the page is broken down into 4K pages, which are then marked executable. + +This is not an issue for shadow paging (except nested EPT), because then +the host is in control of TLB flushes and the problematic situation cannot +happen. With nested EPT, again the nested guest can cause problems so we +treat shadow and direct EPT the same. + +Signed-off-by: Junaid Shahid +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: + - Use kvm_mmu_invalidate_zap_all_pages() instead of kvm_mmu_zap_all_fast() + - Adjust context] +Signed-off-by: Ben Hutchings +--- + .../admin-guide/kernel-parameters.txt | 11 ++ + arch/x86/include/asm/kvm_host.h | 2 + + arch/x86/kernel/cpu/bugs.c | 13 +- + arch/x86/kvm/mmu.c | 135 +++++++++++++++++- + arch/x86/kvm/paging_tmpl.h | 29 +++- + arch/x86/kvm/x86.c | 1 + + 6 files changed, 178 insertions(+), 13 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 16607b178b47..b2c1a5c63ab3 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1956,6 +1956,17 @@ + KVM MMU at runtime. + Default is 0 (off) + ++ kvm.nx_huge_pages= ++ [KVM] Controls the sw workaround for bug ++ X86_BUG_ITLB_MULTIHIT. ++ force : Always deploy workaround. ++ off : Default. Never deploy workaround. ++ auto : Deploy workaround based on presence of ++ X86_BUG_ITLB_MULTIHIT. ++ ++ If the sw workaround is enabled for the host, guests ++ need not enable it for nested guests. 
++ + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. + Default is 1 (enabled) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 90dccb5c79d9..59b44445ed59 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -282,6 +282,7 @@ struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + bool unsync; ++ bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The following two entries are used to key the shadow page in the +@@ -890,6 +891,7 @@ struct kvm_vm_stat { + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; ++ ulong nx_lpage_splits; + ulong max_mmu_page_hash_collisions; + }; + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 60e47e492c2f..1e764992fa64 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1119,6 +1119,9 @@ void x86_spec_ctrl_setup_ap(void) + x86_amd_ssb_disable(); + } + ++bool itlb_multihit_kvm_mitigation; ++EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); ++ + #undef pr_fmt + #define pr_fmt(fmt) "L1TF: " fmt + +@@ -1274,17 +1277,25 @@ static ssize_t l1tf_show_state(char *buf) + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); + } ++ ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ if (itlb_multihit_kvm_mitigation) ++ return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); ++ else ++ return sprintf(buf, "KVM: Vulnerable\n"); ++} + #else + static ssize_t l1tf_show_state(char *buf) + { + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + } +-#endif + + static ssize_t itlb_multihit_show_state(char *buf) + { + return sprintf(buf, "Processor vulnerable\n"); + } ++#endif + + static ssize_t mds_show_state(char *buf) + { +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 7f9be921df7c..19c3dc9b05cb 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -49,6 +49,20 @@ + #include + #include "trace.h" + ++extern bool itlb_multihit_kvm_mitigation; ++ ++static int __read_mostly nx_huge_pages = -1; ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); ++ ++static struct kernel_param_ops nx_huge_pages_ops = { ++ .set = set_nx_huge_pages, ++ .get = param_get_bool, ++}; ++ ++module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages, "bool"); ++ + /* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: +@@ -284,6 +298,11 @@ static inline bool spte_ad_enabled(u64 spte) + return !(spte & shadow_acc_track_value); + } + ++static bool is_nx_huge_page_enabled(void) ++{ ++ return READ_ONCE(nx_huge_pages); ++} ++ + static inline u64 spte_shadow_accessed_mask(u64 spte) + { + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); +@@ -1096,6 +1115,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_disallow_lpage(slot, gfn); + } + ++static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ if (sp->lpage_disallowed) ++ return; ++ ++ ++kvm->stat.nx_lpage_splits; ++ sp->lpage_disallowed = true; ++} ++ + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + { + struct kvm_memslots *slots; +@@ -1113,6 +1141,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_allow_lpage(slot, gfn); + } + ++static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) 
++{ ++ --kvm->stat.nx_lpage_splits; ++ sp->lpage_disallowed = false; ++} ++ + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) + { +@@ -2665,6 +2699,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + kvm_reload_remote_mmus(kvm); + } + ++ if (sp->lpage_disallowed) ++ unaccount_huge_nx_page(kvm, sp); ++ + sp->role.invalid = 1; + return ret; + } +@@ -2873,6 +2910,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + ++ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && ++ is_nx_huge_page_enabled()) { ++ pte_access &= ~ACC_EXEC_MASK; ++ } ++ + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else +@@ -3091,9 +3133,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + __direct_pte_prefetch(vcpu, sp, sptep); + } + ++static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, ++ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) ++{ ++ int level = *levelp; ++ u64 spte = *it.sptep; ++ ++ if (it.level == level && level > PT_PAGE_TABLE_LEVEL && ++ is_nx_huge_page_enabled() && ++ is_shadow_present_pte(spte) && ++ !is_large_pte(spte)) { ++ /* ++ * A small SPTE exists for this pfn, but FNAME(fetch) ++ * and __direct_map would like to create a large PTE ++ * instead: just force them to go down another level, ++ * patching back for them into pfn the next 9 bits of ++ * the address. ++ */ ++ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); ++ *pfnp |= gfn & page_mask; ++ (*levelp)--; ++ } ++} ++ + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, +- bool prefault) ++ bool prefault, bool lpage_disallowed) + { + struct kvm_shadow_walk_iterator it; + struct kvm_mmu_page *sp; +@@ -3106,6 +3171,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. 
++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &level); ++ + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) + break; +@@ -3116,6 +3187,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); + } + } + +@@ -3416,11 +3489,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + { + int r; + int level; +- bool force_pt_level = false; ++ bool force_pt_level; + kvm_pfn_t pfn; + unsigned long mmu_seq; + bool map_writable, write = error_code & PFERR_WRITE_MASK; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + ++ force_pt_level = lpage_disallowed; + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + /* +@@ -3454,7 +3530,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); +- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); ++ r = __direct_map(vcpu, v, write, map_writable, level, pfn, ++ prefault, false); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +@@ -4048,6 +4125,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + +@@ -4058,8 +4137,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (r) + return r; + +- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, +- PT_DIRECTORY_LEVEL); ++ force_pt_level = ++ lpage_disallowed || ++ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + if (level > PT_DIRECTORY_LEVEL && +@@ -4088,7 +4168,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); +- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, ++ prefault, lpage_disallowed); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +@@ -5886,10 +5967,52 @@ static void mmu_destroy_caches(void) + kmem_cache_destroy(mmu_page_header_cache); + } + ++static void __set_nx_huge_pages(bool val) ++{ ++ nx_huge_pages = itlb_multihit_kvm_mitigation = val; ++} ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) ++{ ++ bool old_val = nx_huge_pages; ++ bool new_val; ++ ++ /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ ++ if (sysfs_streq(val, "off")) ++ new_val = 0; ++ else if (sysfs_streq(val, "force")) ++ new_val = 1; ++ else if (sysfs_streq(val, "auto")) ++ new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT); ++ else if (strtobool(val, &new_val) < 0) ++ return -EINVAL; ++ ++ __set_nx_huge_pages(new_val); ++ ++ if (new_val != old_val) { ++ struct kvm *kvm; ++ int idx; ++ ++ mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) { ++ idx = srcu_read_lock(&kvm->srcu); ++ kvm_mmu_invalidate_zap_all_pages(kvm); ++ srcu_read_unlock(&kvm->srcu, idx); ++ } ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return 0; ++} ++ + int kvm_mmu_module_init(void) + { + int ret = -ENOMEM; + ++ if (nx_huge_pages == -1) ++ __set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT)); ++ + kvm_mmu_reset_all_pte_masks(); + + pte_list_desc_cache = kmem_cache_create("pte_list_desc", +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 3b022b08b577..adf42dc8d38b 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -596,13 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int write_fault, int hlevel, +- kvm_pfn_t pfn, bool map_writable, bool prefault) ++ kvm_pfn_t pfn, bool map_writable, bool prefault, ++ bool lpage_disallowed) + { + struct kvm_mmu_page *sp = NULL; + struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; + int top_level, ret; +- gfn_t base_gfn; ++ gfn_t gfn, base_gfn; + + direct_access = gw->pte_access; + +@@ -647,13 +648,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + link_shadow_page(vcpu, it.sptep, sp); + } + +- base_gfn = gw->gfn; ++ /* ++ * FNAME(page_fault) might have clobbered the bottom bits of ++ * gw->gfn, restore them from the virtual address. ++ */ ++ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); ++ base_gfn = gfn; + + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { + clear_sp_write_flooding_count(it.sptep); +- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. 
++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); ++ ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == hlevel) + break; + +@@ -665,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + it.level - 1, true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); + } + } + +@@ -741,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + int r; + kvm_pfn_t pfn; + int level = PT_PAGE_TABLE_LEVEL; +- bool force_pt_level = false; + unsigned long mmu_seq; + bool map_writable, is_self_change_mapping; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); ++ bool force_pt_level = lpage_disallowed; + + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + +@@ -833,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, +- level, pfn, map_writable, prefault); ++ level, pfn, map_writable, prefault, lpage_disallowed); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + + out_unlock: +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 2714c1a0e59f..406a37aa61c7 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -206,6 +206,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, ++ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, + { NULL } diff --git a/debian/patches/bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch b/debian/patches/bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch new file mode 100644 index 000000000..81acc63f2 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch @@ -0,0 +1,131 @@ +From: Junaid Shahid +Date: Fri, 11 Oct 2019 12:40:15 +0200 +Subject: kvm: Add helper function for creating VM worker threads + +This adds a function to create a kernel thread associated with a given +VM. In particular, it ensures that the worker thread inherits the +priority and cgroups of the calling thread. 
+ +Signed-off-by: Junaid Shahid +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +--- + include/linux/kvm_host.h | 6 +++ + virt/kvm/kvm_main.c | 84 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 90 insertions(+) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 553a3115a735..96207939d862 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1305,4 +1305,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) + } + #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ + ++typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr); ++ + #endif +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 5482949b452c..77da54d334b2 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -51,6 +51,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -4142,3 +4143,86 @@ void kvm_exit(void) + kvm_vfio_ops_exit(); + } + EXPORT_SYMBOL_GPL(kvm_exit); ++ ++struct kvm_vm_worker_thread_context { ++ struct kvm *kvm; ++ struct task_struct *parent; ++ struct completion init_done; ++ kvm_vm_thread_fn_t thread_fn; ++ uintptr_t data; ++ int err; ++}; ++ ++static int kvm_vm_worker_thread(void *context) ++{ ++ /* ++ * The init_context is allocated on the stack of the parent thread, so ++ * we have to locally copy anything that is needed beyond initialization ++ */ ++ struct kvm_vm_worker_thread_context *init_context = context; ++ struct kvm *kvm = init_context->kvm; ++ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; ++ uintptr_t data = init_context->data; ++ int err; ++ ++ err = kthread_park(current); ++ /* kthread_park(current) is never supposed to return an error */ ++ WARN_ON(err != 0); ++ if (err) ++ goto init_complete; ++ ++ err = cgroup_attach_task_all(init_context->parent, current); ++ if (err) { ++ kvm_err("%s: cgroup_attach_task_all failed with err %d\n", ++ __func__, err); ++ goto init_complete; ++ } ++ ++ set_user_nice(current, task_nice(init_context->parent)); ++ ++init_complete: ++ init_context->err = err; ++ complete(&init_context->init_done); ++ init_context = NULL; ++ ++ if (err) ++ return err; ++ ++ /* Wait to be woken up by the spawner before proceeding. 
*/ ++ kthread_parkme(); ++ ++ if (!kthread_should_stop()) ++ err = thread_fn(kvm, data); ++ ++ return err; ++} ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr) ++{ ++ struct kvm_vm_worker_thread_context init_context = {}; ++ struct task_struct *thread; ++ ++ *thread_ptr = NULL; ++ init_context.kvm = kvm; ++ init_context.parent = current; ++ init_context.thread_fn = thread_fn; ++ init_context.data = data; ++ init_completion(&init_context.init_done); ++ ++ thread = kthread_run(kvm_vm_worker_thread, &init_context, ++ "%s-%d", name, task_pid_nr(current)); ++ if (IS_ERR(thread)) ++ return PTR_ERR(thread); ++ ++ /* kthread_run is never supposed to return NULL */ ++ WARN_ON(thread == NULL); ++ ++ wait_for_completion(&init_context.init_done); ++ ++ if (!init_context.err) ++ *thread_ptr = thread; ++ ++ return init_context.err; ++} diff --git a/debian/patches/bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch b/debian/patches/bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch new file mode 100644 index 000000000..dd448bbf7 --- /dev/null +++ b/debian/patches/bugfix/x86/itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch @@ -0,0 +1,368 @@ +From: Junaid Shahid +Date: Fri, 11 Oct 2019 12:40:16 +0200 +Subject: kvm: x86: mmu: Recovery of shattered NX large pages + +The page table pages corresponding to broken down large pages are +zapped in FIFO order, so that the large page can potentially +be recovered, if it is no longer being used for execution. This removes +the performance penalty for walking deeper EPT page tables. + +By default, one large page will last about one hour once the guest +reaches a steady state. + +Signed-off-by: Junaid Shahid +Signed-off-by: Paolo Bonzini +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +--- + .../admin-guide/kernel-parameters.txt | 6 + + arch/x86/include/asm/kvm_host.h | 5 + + arch/x86/kvm/mmu.c | 129 ++++++++++++++++++ + arch/x86/kvm/mmu.h | 4 + + arch/x86/kvm/x86.c | 11 ++ + virt/kvm/kvm_main.c | 30 +++- + 6 files changed, 184 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index b2c1a5c63ab3..efdc471ed0b9 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1967,6 +1967,12 @@ + If the sw workaround is enabled for the host, guests + need not enable it for nested guests. + ++ kvm.nx_huge_pages_recovery_ratio= ++ [KVM] Controls how many 4KiB pages are periodically zapped ++ back to huge pages. 0 disables the recovery, otherwise if ++ the value is N KVM will zap 1/Nth of the 4KiB pages every ++ minute. The default is 60. ++ + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. 
+ Default is 1 (enabled) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 59b44445ed59..efe3ba61fc23 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -281,6 +281,8 @@ struct kvm_rmap_head { + struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; ++ struct list_head lpage_disallowed_link; ++ + bool unsync; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + +@@ -808,6 +810,7 @@ struct kvm_arch { + */ + struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; ++ struct list_head lpage_disallowed_mmu_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; + +@@ -878,6 +881,8 @@ struct kvm_arch { + bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; ++ ++ struct task_struct *nx_lpage_recovery_thread; + }; + + struct kvm_vm_stat { +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 19c3dc9b05cb..bafb9001ce94 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -40,6 +40,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -52,16 +53,26 @@ + extern bool itlb_multihit_kvm_mitigation; + + static int __read_mostly nx_huge_pages = -1; ++static uint __read_mostly nx_huge_pages_recovery_ratio = 60; + + static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + + static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, + }; + ++static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { ++ .set = set_nx_huge_pages_recovery_ratio, ++ .get = param_get_uint, ++}; ++ + module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); + __MODULE_PARM_TYPE(nx_huge_pages, "bool"); ++module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, ++ &nx_huge_pages_recovery_ratio, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + + /* + * When setting this variable to true it enables Two-Dimensional-Paging +@@ -1121,6 +1132,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) + return; + + ++kvm->stat.nx_lpage_splits; ++ list_add_tail(&sp->lpage_disallowed_link, ++ &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; + } + +@@ -1145,6 +1158,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) + { + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; ++ list_del(&sp->lpage_disallowed_link); + } + + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, +@@ -5999,6 +6013,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) + idx = srcu_read_lock(&kvm->srcu); + kvm_mmu_invalidate_zap_all_pages(kvm); + srcu_read_unlock(&kvm->srcu, idx); ++ ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } +@@ -6079,3 +6095,116 @@ void kvm_mmu_module_exit(void) + unregister_shrinker(&mmu_shrinker); + mmu_audit_disable(); + } ++ ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) ++{ ++ unsigned int old_val; ++ int err; ++ ++ old_val = nx_huge_pages_recovery_ratio; ++ err = param_set_uint(val, kp); ++ if (err) ++ return err; ++ ++ if (READ_ONCE(nx_huge_pages) && ++ !old_val && nx_huge_pages_recovery_ratio) { ++ struct kvm *kvm; ++ ++ 
mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); ++ ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return err; ++} ++ ++static void kvm_recover_nx_lpages(struct kvm *kvm) ++{ ++ int rcu_idx; ++ struct kvm_mmu_page *sp; ++ unsigned int ratio; ++ LIST_HEAD(invalid_list); ++ ulong to_zap; ++ ++ rcu_idx = srcu_read_lock(&kvm->srcu); ++ spin_lock(&kvm->mmu_lock); ++ ++ ratio = READ_ONCE(nx_huge_pages_recovery_ratio); ++ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; ++ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { ++ /* ++ * We use a separate list instead of just using active_mmu_pages ++ * because the number of lpage_disallowed pages is expected to ++ * be relatively small compared to the total. ++ */ ++ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, ++ struct kvm_mmu_page, ++ lpage_disallowed_link); ++ WARN_ON_ONCE(!sp->lpage_disallowed); ++ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); ++ WARN_ON_ONCE(sp->lpage_disallowed); ++ ++ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { ++ kvm_mmu_commit_zap_page(kvm, &invalid_list); ++ if (to_zap) ++ cond_resched_lock(&kvm->mmu_lock); ++ } ++ } ++ ++ spin_unlock(&kvm->mmu_lock); ++ srcu_read_unlock(&kvm->srcu, rcu_idx); ++} ++ ++static long get_nx_lpage_recovery_timeout(u64 start_time) ++{ ++ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) ++ ? start_time + 60 * HZ - get_jiffies_64() ++ : MAX_SCHEDULE_TIMEOUT; ++} ++ ++static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) ++{ ++ u64 start_time; ++ long remaining_time; ++ ++ while (true) { ++ start_time = get_jiffies_64(); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ while (!kthread_should_stop() && remaining_time > 0) { ++ schedule_timeout(remaining_time); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ set_current_state(TASK_RUNNING); ++ ++ if (kthread_should_stop()) ++ return 0; ++ ++ kvm_recover_nx_lpages(kvm); ++ } ++} ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm) ++{ ++ int err; ++ ++ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, ++ "kvm-nx-lpage-recovery", ++ &kvm->arch.nx_lpage_recovery_thread); ++ if (!err) ++ kthread_unpark(kvm->arch.nx_lpage_recovery_thread); ++ ++ return err; ++} ++ ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm) ++{ ++ if (kvm->arch.nx_lpage_recovery_thread) ++ kthread_stop(kvm->arch.nx_lpage_recovery_thread); ++} +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index 65892288bf51..f7b2de7b6382 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -216,4 +216,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm); ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm); ++ + #endif +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 406a37aa61c7..1ecadf51f154 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8950,6 +8950,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); ++ 
INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); + +@@ -8981,6 +8982,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + return 0; + } + ++int kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return kvm_mmu_post_init_vm(kvm); ++} ++ + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) + { + vcpu_load(vcpu); +@@ -9082,6 +9088,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) + } + EXPORT_SYMBOL_GPL(x86_set_memory_region); + ++void kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++ kvm_mmu_pre_destroy_vm(kvm); ++} ++ + void kvm_arch_destroy_vm(struct kvm *kvm) + { + if (current->mm == kvm->mm) { +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 77da54d334b2..7a0d86d52230 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -625,6 +625,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) + return 0; + } + ++/* ++ * Called after the VM is otherwise initialized, but just before adding it to ++ * the vm_list. ++ */ ++int __weak kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return 0; ++} ++ ++/* ++ * Called just after removing the VM from the vm_list, but before doing any ++ * other destruction. ++ */ ++void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++} ++ + static struct kvm *kvm_create_vm(unsigned long type) + { + int r, i; +@@ -679,10 +696,14 @@ static struct kvm *kvm_create_vm(unsigned long type) + rcu_assign_pointer(kvm->buses[i], + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + if (!kvm->buses[i]) +- goto out_err; ++ goto out_err_no_mmu_notifier; + } + + r = kvm_init_mmu_notifier(kvm); ++ if (r) ++ goto out_err_no_mmu_notifier; ++ ++ r = kvm_arch_post_init_vm(kvm); + if (r) + goto out_err; + +@@ -695,6 +716,11 @@ static struct kvm *kvm_create_vm(unsigned long type) + return kvm; + + out_err: ++#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) ++ if (kvm->mmu_notifier.ops) ++ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); ++#endif ++out_err_no_mmu_notifier: + cleanup_srcu_struct(&kvm->irq_srcu); + out_err_no_irq_srcu: + cleanup_srcu_struct(&kvm->srcu); +@@ -737,6 +763,8 @@ static void kvm_destroy_vm(struct kvm *kvm) + mutex_lock(&kvm_lock); + list_del(&kvm->vm_list); + mutex_unlock(&kvm_lock); ++ kvm_arch_pre_destroy_vm(kvm); ++ + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) { + struct kvm_io_bus *bus = kvm_get_bus(kvm, i); diff --git a/debian/patches/series b/debian/patches/series index 891d589ca..6dc480d6d 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -258,6 +258,18 @@ bugfix/all/ALSA-usb-audio-Fix-a-stack-buffer-overflow-bug-in-check_input_term.pa bugfix/all/vhost-make-sure-log_num-in_num.patch bugfix/x86/x86-ptrace-fix-up-botched-merge-of-spectrev1-fix.patch bugfix/all/KVM-coalesced_mmio-add-bounds-checking.patch +bugfix/x86//itlb_multihit/0001-KVM-x86-adjust-kvm_mmu_page-member-to-save-8-bytes.patch +bugfix/x86//itlb_multihit/0002-kvm-Convert-kvm_lock-to-a-mutex.patch +bugfix/x86//itlb_multihit/0003-kvm-x86-Do-not-release-the-page-inside-mmu_set_spte.patch +bugfix/x86//itlb_multihit/0004-KVM-x86-make-FNAME-fetch-and-__direct_map-more-simil.patch +bugfix/x86//itlb_multihit/0005-KVM-x86-remove-now-unneeded-hugepage-gfn-adjustment.patch +bugfix/x86//itlb_multihit/0006-KVM-x86-change-kvm_mmu_page_get_gfn-BUG_ON-to-WARN_O.patch +bugfix/x86//itlb_multihit/0007-KVM-x86-add-tracepoints-around-__direct_map-and-FNAM.patch 
+bugfix/x86//itlb_multihit/0008-kvm-x86-powerpc-do-not-allow-clearing-largepages-deb.patch +bugfix/x86//itlb_multihit/0009-x86-Add-ITLB_MULTIHIT-bug-infrastructure.patch +bugfix/x86//itlb_multihit/0010-kvm-mmu-ITLB_MULTIHIT-mitigation.patch +bugfix/x86//itlb_multihit/0011-kvm-Add-helper-function-for-creating-VM-worker-threa.patch +bugfix/x86//itlb_multihit/0012-kvm-x86-mmu-Recovery-of-shattered-NX-large-pages.patch # ABI maintenance debian/abi/powerpc-avoid-abi-change-for-disabling-tm.patch
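
Note: as a rough illustration of the tunables added above, the stand-alone user-space C sketch below mirrors two pieces of behaviour from the quoted patches: how the nx_huge_pages module parameter values "off"/"force"/"auto" resolve to an effective setting in set_nx_huge_pages(), and how kvm_recover_nx_lpages() sizes each one-minute recovery pass as 1/Nth of kvm->stat.nx_lpage_splits, rounded up. It is not kernel code; the helper names resolve_nx_huge_pages() and recovery_batch_size() are invented for the example and are not kernel symbols.

/*
 * Illustrative user-space sketch of the nx_huge_pages parameter semantics
 * and the recovery batch sizing from the patches above.  Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the kernel's DIV_ROUND_UP() macro. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/*
 * "off" -> disabled, "force" -> enabled, "auto" -> enabled only when the
 * CPU is affected by ITLB_MULTIHIT (as in set_nx_huge_pages() above).
 */
static bool resolve_nx_huge_pages(const char *val, bool cpu_has_bug)
{
	if (strcmp(val, "off") == 0)
		return false;
	if (strcmp(val, "force") == 0)
		return true;
	if (strcmp(val, "auto") == 0)
		return cpu_has_bug;
	/*
	 * The kernel falls back to strtobool() here and rejects anything it
	 * cannot parse; for this sketch, treat "1" as on and every other
	 * unrecognised value as off.
	 */
	return strcmp(val, "1") == 0;
}

/*
 * With recovery ratio N, each one-minute pass zaps 1/Nth of the currently
 * split NX huge pages, rounded up; a ratio of 0 disables recovery.
 */
static unsigned long recovery_batch_size(unsigned long nx_lpage_splits,
					 unsigned int ratio)
{
	return ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
}

int main(void)
{
	printf("auto on an affected CPU  -> %d\n",
	       resolve_nx_huge_pages("auto", true));
	printf("splits=1000, ratio=60    -> zap %lu pages per minute\n",
	       recovery_batch_size(1000, 60));	/* 17 */
	printf("splits=1000, ratio=0     -> zap %lu (recovery disabled)\n",
	       recovery_batch_size(1000, 0));
	return 0;
}

Rounding the batch up guarantees forward progress even when only a few pages have been split, while a ratio of 0 leaves the worker sleeping indefinitely (MAX_SCHEDULE_TIMEOUT in get_nx_lpage_recovery_timeout()), matching the kvm.nx_huge_pages_recovery_ratio documentation added above.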