[PATCH 3/6] KVM: x86/mmu: avoid struct page in MMU
From: David Stevens <hidden>
Date: 2021-06-24 03:59:43
Also in:
dri-devel, intel-gfx, kvm, kvmarm, linux-arm-kernel, linuxppc-dev, lkml
Subsystem:
kernel virtual machine for x86 (kvm/x86), the rest, x86 architecture (32-bit and 64-bit) · Maintainers:
Sean Christopherson, Paolo Bonzini, Linus Torvalds, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen
From: David Stevens <redacted>

Avoid converting pfns returned by follow_fault_pfn to struct pages to
transiently take a reference. The reference was originally taken to
match the reference taken by gup. However, pfns returned by
follow_fault_pfn may not have a struct page set up for reference
counting.

Signed-off-by: David Stevens <redacted>
---
 arch/x86/kvm/mmu/mmu.c          | 56 +++++++++++++++++++---------------
 arch/x86/kvm/mmu/mmu_audit.c    | 13 ++++----
 arch/x86/kvm/mmu/mmu_internal.h |  3 +-
 arch/x86/kvm/mmu/paging_tmpl.h  | 36 ++++++++++++---------
 arch/x86/kvm/mmu/tdp_mmu.c      |  7 +++--
 arch/x86/kvm/mmu/tdp_mmu.h      |  4 +--
 arch/x86/kvm/x86.c              |  9 +++---
 7 files changed, 73 insertions(+), 55 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 84913677c404..8fa4a4a411ba 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c@@ -2610,16 +2610,16 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, return ret; } -static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) +static struct kvm_pfn_page pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, + gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return KVM_PFN_ERR_FAULT; + return KVM_PFN_PAGE_ERR(KVM_PFN_ERR_FAULT); - return kvm_pfn_page_unwrap(gfn_to_pfn_memslot_atomic(slot, gfn)); + return gfn_to_pfn_memslot_atomic(slot, gfn); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2748,7 +2748,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) + struct page *page, bool huge_page_disallowed, + int *req_level) { struct kvm_memory_slot *slot; kvm_pfn_t pfn = *pfnp;
@@ -2760,6 +2761,9 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; + if (!page) + return PG_LEVEL_4K; + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) return PG_LEVEL_4K;
@@ -2814,7 +2818,8 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, + int map_writable, int max_level, + const struct kvm_pfn_page *pfnpg, bool prefault, bool is_tdp) { bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
@@ -2826,11 +2831,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; + kvm_pfn_t pfn = pfnpg->pfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, pfnpg->page, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn);
@@ -3672,8 +3678,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + gpa_t cr2_or_gpa, struct kvm_pfn_page *pfnpg, + hva_t *hva, bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async;
@@ -3688,17 +3694,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; + *pfnpg = KVM_PFN_PAGE_ERR(KVM_PFN_NOSLOT); *writable = false; return false; } async = false; - *pfn = kvm_pfn_page_unwrap(__gfn_to_pfn_memslot(slot, gfn, false, - &async, write, - writable, hva)); + *pfnpg = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) - return false; /* *pfn has correct page already */ + return false; /* *pfnpg has correct page already */ if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
@@ -3710,8 +3715,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = kvm_pfn_page_unwrap(__gfn_to_pfn_memslot(slot, gfn, false, NULL, - write, writable, hva)); + *pfnpg = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; }
@@ -3723,7 +3728,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - kvm_pfn_t pfn; + struct kvm_pfn_page pfnpg; hva_t hva; int r;
@@ -3743,11 +3748,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfnpg, &hva, write, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, + gfn, pfnpg.pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY;
@@ -3757,7 +3763,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (!is_noslot_pfn(pfnpg.pfn) && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r)
@@ -3765,17 +3772,18 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, - pfn, prefault); + &pfnpg, prefault); else - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, + &pfnpg, prefault, is_tdp); out_unlock: if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + if (pfnpg.page) + put_page(pfnpg.page); return r; }
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index 3f983dc6e0f1..72b470b892da 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c@@ -94,7 +94,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) { struct kvm_mmu_page *sp; gfn_t gfn; - kvm_pfn_t pfn; + struct kvm_pfn_page pfnpg; hpa_t hpa; sp = sptep_to_sp(sptep);
@@ -111,18 +111,19 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) return; gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = kvm_pfn_page_unwrap(kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn)); + pfnpg = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); - if (is_error_pfn(pfn)) + if (is_error_pfn(pfnpg.pfn)) return; - hpa = pfn << PAGE_SHIFT; + hpa = pfnpg.pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llx\n", vcpu->arch.mmu->root_level, pfn, + "ent %llx\n", vcpu->arch.mmu->root_level, pfnpg.pfn, hpa, *sptep); - kvm_release_pfn_clean(pfn); + if (pfnpg.page) + put_page(pfnpg.page); } static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d64ccb417c60..db4d878fde4e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h@@ -154,7 +154,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level); + struct page *page, bool huge_page_disallowed, + int *req_level); void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 823a5919f9fa..db13efd4b62d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h@@ -535,7 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, { unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; + struct kvm_pfn_page pfnpg; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false;
@@ -545,19 +545,20 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + pfnpg = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) + if (is_error_pfn(pfnpg.pfn)) return false; /* * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfnpg.pfn, true, true); - kvm_release_pfn_clean(pfn); + if (pfnpg.page) + put_page(pfnpg.page); return true; }
@@ -637,8 +638,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, u32 error_code, - int max_level, kvm_pfn_t pfn, bool map_writable, - bool prefault) + int max_level, struct kvm_pfn_page *pfnpg, + bool map_writable, bool prefault) { bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); bool write_fault = error_code & PFERR_WRITE_MASK;
@@ -649,6 +650,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; + kvm_pfn_t pfn = pfnpg->pfn; direct_access = gw->pte_access;
@@ -695,7 +697,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, } level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + pfnpg->page, huge_page_disallowed, + &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
@@ -801,7 +804,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; + struct kvm_pfn_page pfnpg; hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping;
@@ -853,11 +856,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfnpg, &hva, write_fault, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfnpg.pfn, + walker.pte_access, &r)) return r; /*
@@ -866,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, */ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && !is_write_protection(vcpu) && !user_fault && - !is_noslot_pfn(pfn)) { + !is_noslot_pfn(pfnpg.pfn)) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK;
@@ -882,20 +886,22 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (!is_noslot_pfn(pfnpg.pfn) && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, &pfnpg, map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + if (pfnpg.page) + put_page(pfnpg.page); return r; }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 237317b1eddd..b0e6d63f0fe1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c@@ -960,8 +960,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * page tables and SPTEs to translate the faulting guest physical address. */ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault) + int map_writable, int max_level, + const struct kvm_pfn_page *pfnpg, bool prefault) { bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); bool write = error_code & PFERR_WRITE_MASK;
@@ -976,13 +976,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; int level; int req_level; + kvm_pfn_t pfn = pfnpg->pfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, pfnpg->page, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..f78681b9dcb7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h@@ -52,8 +52,8 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault); + int map_writable, int max_level, + const struct kvm_pfn_page *pfnpg, bool prefault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d31797e0cb6e..86d66c765190 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c@@ -7311,7 +7311,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type) { gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; + struct kvm_pfn_page pfnpg; if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false;
@@ -7341,16 +7341,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * retry instruction -> write #PF -> emulation fail -> retry * instruction -> ... */ - pfn = kvm_pfn_page_unwrap(gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa))); + pfnpg = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); /* * If the instruction failed on the error pfn, it can not be fixed, * report the error to userspace. */ - if (is_error_noslot_pfn(pfn)) + if (is_error_noslot_pfn(pfnpg.pfn)) return false; - kvm_release_pfn_clean(pfn); + if (pfnpg.page) + put_page(pfnpg.page); /* The instructions are well-emulated on direct mmu. */ if (vcpu->arch.mmu->direct_map) {
--
2.32.0.93.g670b81a890-goog