mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 08:35:19 -05:00
ARM:
- Plug race between enabling MTE and creating vcpus - Fix off-by-one bug when checking whether an address range is RAM x86: - Fixes for the new MMU, especially a memory leak on hosts with <39 physical address bits - Remove bogus EFER.NX checks on 32-bit non-PAE hosts - WAITPKG fix -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmEWjBwUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroPrMgf9EDBsRvD/Kids0kddaoAgM6qICdsH tQX/GdsmecUlU16Bkp21XeZif1ZKcJxCmx/dhYmid3woi9HuX5AreFTlLjlJDRxg +lJvboqTV0kk7PjaYkOaqd42RSg/BiSLZ+JVPpbW7CqeIr1lGG4yhIC/Nl7fCCto sCaY/NoxtraoG5+WZcRRP7XptQmMRckVZ9bimHHh8dKqMkosGx1hcGfj64aKmx4F 2EVrrjr+an3mpMnwvUIgNw4xEj/jUCFebvGAROVEsrZzNTZ9UrwgT0HeA92XwQVQ 93z7nqcBUKHH11rnbOvRESEJD9f6I9vCSaiqRROwmoqLY/Xi7jly7XeDcA== =Lj8B -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull KVM fixes from Paolo Bonzini: "ARM: - Plug race between enabling MTE and creating vcpus - Fix off-by-one bug when checking whether an address range is RAM x86: - Fixes for the new MMU, especially a memory leak on hosts with <39 physical address bits - Remove bogus EFER.NX checks on 32-bit non-PAE hosts - WAITPKG fix" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock KVM: x86/mmu: Don't step down in the TDP iterator when zapping all SPTEs KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF kvm: vmx: Sync all matching EPTPs when injecting nested EPT fault KVM: x86: remove dead initialization KVM: x86: Allow guest to set EFER.NX=1 on non-PAE 32-bit kernels KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation KVM: arm64: Fix race when enabling KVM_ARM_CAP_MTE KVM: arm64: Fix off-by-one in range_is_memory
This commit is contained in:
commit
3e763ec791
10 changed files with 118 additions and 62 deletions
|
@ -25,10 +25,10 @@ On x86:
|
|||
|
||||
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
|
||||
|
||||
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
|
||||
taken inside kvm->arch.mmu_lock, and cannot be taken without already
|
||||
holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
|
||||
there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
|
||||
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
|
||||
kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
|
||||
cannot be taken without already holding kvm->arch.mmu_lock (typically with
|
||||
``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
|
||||
|
||||
Everything else is a leaf: no other lock is taken inside the critical
|
||||
sections.
|
||||
|
|
|
@ -94,10 +94,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
|
|||
kvm->arch.return_nisv_io_abort_to_user = true;
|
||||
break;
|
||||
case KVM_CAP_ARM_MTE:
|
||||
if (!system_supports_mte() || kvm->created_vcpus)
|
||||
return -EINVAL;
|
||||
r = 0;
|
||||
kvm->arch.mte_enabled = true;
|
||||
mutex_lock(&kvm->lock);
|
||||
if (!system_supports_mte() || kvm->created_vcpus) {
|
||||
r = -EINVAL;
|
||||
} else {
|
||||
r = 0;
|
||||
kvm->arch.mte_enabled = true;
|
||||
}
|
||||
mutex_unlock(&kvm->lock);
|
||||
break;
|
||||
default:
|
||||
r = -EINVAL;
|
||||
|
|
|
@ -193,7 +193,7 @@ static bool range_is_memory(u64 start, u64 end)
|
|||
{
|
||||
struct kvm_mem_range r1, r2;
|
||||
|
||||
if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
|
||||
if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
|
||||
return false;
|
||||
if (r1.start != r2.start)
|
||||
return false;
|
||||
|
|
|
@ -1038,6 +1038,13 @@ struct kvm_arch {
|
|||
struct list_head lpage_disallowed_mmu_pages;
|
||||
struct kvm_page_track_notifier_node mmu_sp_tracker;
|
||||
struct kvm_page_track_notifier_head track_notifier_head;
|
||||
/*
|
||||
* Protects marking pages unsync during page faults, as TDP MMU page
|
||||
* faults only take mmu_lock for read. For simplicity, the unsync
|
||||
* pages lock is always taken when marking pages unsync regardless of
|
||||
* whether mmu_lock is held for read or write.
|
||||
*/
|
||||
spinlock_t mmu_unsync_pages_lock;
|
||||
|
||||
struct list_head assigned_dev_head;
|
||||
struct iommu_domain *iommu_domain;
|
||||
|
|
|
@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
kvm_mmu_after_set_cpuid(vcpu);
|
||||
}
|
||||
|
||||
static int is_efer_nx(void)
|
||||
{
|
||||
return host_efer & EFER_NX;
|
||||
}
|
||||
|
||||
static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
int i;
|
||||
struct kvm_cpuid_entry2 *e, *entry;
|
||||
|
||||
entry = NULL;
|
||||
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
|
||||
e = &vcpu->arch.cpuid_entries[i];
|
||||
if (e->function == 0x80000001) {
|
||||
entry = e;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
|
||||
cpuid_entry_clear(entry, X86_FEATURE_NX);
|
||||
printk(KERN_INFO "kvm: guest NX capability removed\n");
|
||||
}
|
||||
}
|
||||
|
||||
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
|
|||
vcpu->arch.cpuid_entries = e2;
|
||||
vcpu->arch.cpuid_nent = cpuid->nent;
|
||||
|
||||
cpuid_fix_nx_cap(vcpu);
|
||||
kvm_update_cpuid_runtime(vcpu);
|
||||
kvm_vcpu_after_set_cpuid(vcpu);
|
||||
|
||||
|
@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
|
|||
|
||||
void kvm_set_cpu_caps(void)
|
||||
{
|
||||
unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
|
||||
#ifdef CONFIG_X86_64
|
||||
unsigned int f_gbpages = F(GBPAGES);
|
||||
unsigned int f_lm = F(LM);
|
||||
|
@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void)
|
|||
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
|
||||
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
|
||||
F(PAT) | F(PSE36) | 0 /* Reserved */ |
|
||||
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
|
||||
F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
|
||||
F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
|
||||
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
|
||||
);
|
||||
|
|
|
@ -1933,7 +1933,7 @@ ret_success:
|
|||
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *entry;
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
struct kvm_vcpu_hv *hv_vcpu;
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
|
||||
if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
|
||||
|
|
|
@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
|
||||
{
|
||||
struct kvm_mmu_page *sp;
|
||||
bool locked = false;
|
||||
|
||||
/*
|
||||
* Force write-protection if the page is being tracked. Note, the page
|
||||
|
@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
|
|||
if (sp->unsync)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* TDP MMU page faults require an additional spinlock as they
|
||||
* run with mmu_lock held for read, not write, and the unsync
|
||||
* logic is not thread safe. Take the spinlock regardless of
|
||||
* the MMU type to avoid extra conditionals/parameters, there's
|
||||
* no meaningful penalty if mmu_lock is held for write.
|
||||
*/
|
||||
if (!locked) {
|
||||
locked = true;
|
||||
spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
|
||||
|
||||
/*
|
||||
* Recheck after taking the spinlock, a different vCPU
|
||||
* may have since marked the page unsync. A false
|
||||
* positive on the unprotected check above is not
|
||||
* possible as clearing sp->unsync _must_ hold mmu_lock
|
||||
* for write, i.e. unsync cannot transition from 1->0
|
||||
* while this CPU holds mmu_lock for read (or write).
|
||||
*/
|
||||
if (READ_ONCE(sp->unsync))
|
||||
continue;
|
||||
}
|
||||
|
||||
WARN_ON(sp->role.level != PG_LEVEL_4K);
|
||||
kvm_unsync_page(vcpu, sp);
|
||||
}
|
||||
if (locked)
|
||||
spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
|
||||
|
||||
/*
|
||||
* We need to ensure that the marking of unsync pages is visible
|
||||
|
@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
|
|||
{
|
||||
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
|
||||
|
||||
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
|
||||
|
||||
if (!kvm_mmu_init_tdp_mmu(kvm))
|
||||
/*
|
||||
* No smp_load/store wrappers needed here as we are in
|
||||
|
|
|
@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
|
|||
if (!kvm->arch.tdp_mmu_enabled)
|
||||
return;
|
||||
|
||||
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
|
||||
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
|
||||
|
||||
/*
|
||||
|
@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
|
|||
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
|
||||
bool shared)
|
||||
{
|
||||
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
|
||||
|
||||
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
|
||||
|
||||
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
|
||||
|
@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
list_del_rcu(&root->link);
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
|
||||
zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
|
||||
zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
|
||||
|
||||
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
|
||||
}
|
||||
|
@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
gfn_t start, gfn_t end, bool can_yield, bool flush,
|
||||
bool shared)
|
||||
{
|
||||
gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
|
||||
bool zap_all = (start == 0 && end >= max_gfn_host);
|
||||
struct tdp_iter iter;
|
||||
|
||||
/*
|
||||
* No need to try to step down in the iterator when zapping all SPTEs,
|
||||
* zapping the top-level non-leaf SPTEs will recurse on their children.
|
||||
*/
|
||||
int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
|
||||
|
||||
/*
|
||||
* Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
|
||||
* hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
|
||||
* and so KVM will never install a SPTE for such addresses.
|
||||
*/
|
||||
end = min(end, max_gfn_host);
|
||||
|
||||
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
tdp_root_for_each_pte(iter, root, start, end) {
|
||||
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
|
||||
min_level, start, end) {
|
||||
retry:
|
||||
if (can_yield &&
|
||||
tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
|
||||
|
@ -744,9 +759,10 @@ retry:
|
|||
/*
|
||||
* If this is a non-last-level SPTE that covers a larger range
|
||||
* than should be zapped, continue, and zap the mappings at a
|
||||
* lower level.
|
||||
* lower level, except when zapping all SPTEs.
|
||||
*/
|
||||
if ((iter.gfn < start ||
|
||||
if (!zap_all &&
|
||||
(iter.gfn < start ||
|
||||
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
|
||||
!is_last_spte(iter.old_spte, iter.level))
|
||||
continue;
|
||||
|
@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
|
|||
|
||||
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
|
||||
{
|
||||
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
|
||||
bool flush = false;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
|
||||
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
|
||||
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
|
||||
flush, false);
|
||||
|
||||
if (flush)
|
||||
|
@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
|
|||
*/
|
||||
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
|
||||
{
|
||||
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
|
||||
struct kvm_mmu_page *next_root;
|
||||
struct kvm_mmu_page *root;
|
||||
bool flush = false;
|
||||
|
@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
|
|||
|
||||
rcu_read_unlock();
|
||||
|
||||
flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
|
||||
true);
|
||||
flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
|
||||
|
||||
/*
|
||||
* Put the reference acquired in
|
||||
|
|
|
@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
|
|||
vcpu_put(vcpu);
|
||||
}
|
||||
|
||||
#define EPTP_PA_MASK GENMASK_ULL(51, 12)
|
||||
|
||||
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
|
||||
{
|
||||
return VALID_PAGE(root_hpa) &&
|
||||
((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
|
||||
}
|
||||
|
||||
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
|
||||
gpa_t addr)
|
||||
{
|
||||
uint i;
|
||||
struct kvm_mmu_root_info *cached_root;
|
||||
|
||||
WARN_ON_ONCE(!mmu_is_nested(vcpu));
|
||||
|
||||
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
|
||||
cached_root = &vcpu->arch.mmu->prev_roots[i];
|
||||
|
||||
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
|
||||
eptp))
|
||||
vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
|
||||
}
|
||||
}
|
||||
|
||||
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
|
||||
struct x86_exception *fault)
|
||||
{
|
||||
|
@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
|
|||
vm_exit_reason = EXIT_REASON_PML_FULL;
|
||||
vmx->nested.pml_full = false;
|
||||
exit_qualification &= INTR_INFO_UNBLOCK_NMI;
|
||||
} else if (fault->error_code & PFERR_RSVD_MASK)
|
||||
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
|
||||
else
|
||||
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
|
||||
} else {
|
||||
if (fault->error_code & PFERR_RSVD_MASK)
|
||||
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
|
||||
else
|
||||
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
|
||||
|
||||
/*
|
||||
* Although the caller (kvm_inject_emulated_page_fault) would
|
||||
* have already synced the faulting address in the shadow EPT
|
||||
* tables for the current EPTP12, we also need to sync it for
|
||||
* any other cached EPTP02s based on the same EP4TA, since the
|
||||
* TLB associates mappings to the EP4TA rather than the full EPTP.
|
||||
*/
|
||||
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
|
||||
fault->address);
|
||||
}
|
||||
|
||||
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
|
||||
vmcs12->guest_physical_address = fault->address;
|
||||
|
@ -5325,14 +5362,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
|
|||
return nested_vmx_succeed(vcpu);
|
||||
}
|
||||
|
||||
#define EPTP_PA_MASK GENMASK_ULL(51, 12)
|
||||
|
||||
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
|
||||
{
|
||||
return VALID_PAGE(root_hpa) &&
|
||||
((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
|
||||
}
|
||||
|
||||
/* Emulate the INVEPT instruction */
|
||||
static int handle_invept(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
|
@ -5826,7 +5855,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
|
|||
if (is_nmi(intr_info))
|
||||
return true;
|
||||
else if (is_page_fault(intr_info))
|
||||
return vcpu->arch.apf.host_apf_flags || !enable_ept;
|
||||
return vcpu->arch.apf.host_apf_flags ||
|
||||
vmx_need_pf_intercept(vcpu);
|
||||
else if (is_debug(intr_info) &&
|
||||
vcpu->guest_debug &
|
||||
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
|
||||
|
|
|
@ -522,7 +522,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
|
|||
|
||||
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
|
||||
{
|
||||
return vmx->secondary_exec_control &
|
||||
return secondary_exec_controls_get(vmx) &
|
||||
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue