diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 460 |
1 files changed, 366 insertions, 94 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 676edfc19a95..3a281a2decde 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -44,6 +45,30 @@ #include <asm/vmx.h> #include <asm/kvm_page_track.h> +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -131,9 +156,6 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -142,6 +164,20 @@ module_param(dbg, bool, 0644); /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 +/* + * Return values of handle_mmio_page_fault and mmu.page_fault: + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * + * For handle_mmio_page_fault only: + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE = 1, + RET_PF_INVALID = 2, +}; + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static bool is_executable_pte(u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { shadow_mmio_mask = mmio_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* @@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return ret; } @@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= shadow_accessed_mask; + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -2598,13 +2680,13 @@ done: return ret; } -static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - bool emulate = false; + int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - emulate = true; + ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) - emulate = true; + ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - - return emulate; + return ret; } static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { - struct kvm_shadow_walk_iterator iterator; + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault, bool lpage_disallowed) +{ + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; - break; - } + trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); - drop_large_spte(vcpu, iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) + break; - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. - * Return 1 to tell kvm to emulate it. */ if (pfn == KVM_PFN_ERR_RO_FAULT) - return 1; + return RET_PF_EMULATE; if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; + return RET_PF_RETRY; } return -EFAULT; } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -2826,7 +2934,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, } if (fast_page_fault(vcpu, v, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return r; } @@ -3383,38 +3490,38 @@ exit: return reserved; } -int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) - return RET_MMIO_PF_BUG; + return -EINVAL; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) - return RET_MMIO_PF_INVALID; + return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return RET_MMIO_PF_RETRY; + return RET_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); @@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } if (fast_page_fault(vcpu, gpa, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_MMIO_PF_EMULATE) { + if (r == RET_PF_EMULATE) { emulation_type = 0; goto emulate; } - if (r == RET_MMIO_PF_RETRY) - return 1; - if (r < 0) - return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + if (r == RET_PF_INVALID) { + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + WARN_ON(r == RET_PF_INVALID); + } + + if (r == RET_PF_RETRY) + return 1; if (r < 0) return r; - if (!r) - return 1; if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; @@ -4781,9 +4890,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; @@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5015,7 +5124,7 @@ unlock: break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } @@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + int idx; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + idx = srcu_read_lock(&kvm->srcu); + kvm_mmu_invalidate_zap_all_pages(kvm); + srcu_read_unlock(&kvm->srcu, idx); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); @@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} |