mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 16:53:58 -05:00
* Fix syzkaller NULL pointer dereference
* Fix TDP MMU performance issue with disabling dirty logging * Fix 5.14 regression with SVM TSC scaling * Fix indefinite stall on applying live patches * Fix unstable selftest * Fix memory leak from wrong copy-and-paste * Fix missed PV TLB flush when racing with emulation -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmKglysUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroOJDAgArpPcAnJbeT2VQTQcp94e4tp9k1Sf gmUewajco4zFVB/sldE0fIporETkaX+FYYPiaNDdNgJ2lUw/HUJBN7KoFEYTZ37N Xx/qXiIXQYFw1bmxTnacLzIQtD3luMCzOs/6/Q7CAFZIBpUtUEjkMlQOBuxoKeG0 B0iLCTJSw0taWcN170aN8G6T+5+bdR3AJW1k2wkgfESfYF9NfJoTUHQj9WTMzM2R aBRuXvUI/rWKvQY3DfoRmgg9Ig/SirSC+abbKIs4H08vZIEUlPk3WOZSKpsN/Wzh 3XDnVRxgnaRLx6NI/ouI2UYJCmjPKbNcueGCf5IfUcHvngHjAEG/xxe4Qw== =zQ9u -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull KVM fixes from Paolo Bonzini: - syzkaller NULL pointer dereference - TDP MMU performance issue with disabling dirty logging - 5.14 regression with SVM TSC scaling - indefinite stall on applying live patches - unstable selftest - memory leak from wrong copy-and-paste - missed PV TLB flush when racing with emulation * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86: do not report a vCPU as preempted outside instruction boundaries KVM: x86: do not set st->preempted when going back to user space KVM: SVM: fix tsc scaling cache logic KVM: selftests: Make hyperv_clock selftest more stable KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging x86: drop bogus "cc" clobber from __try_cmpxchg_user_asm() KVM: x86/mmu: Check every prev_roots in __kvm_mmu_free_obsolete_roots() entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set KVM: Don't null dereference ops->destroy
This commit is contained in:
commit
34f4335c16
15 changed files with 124 additions and 47 deletions
|
@ -653,6 +653,7 @@ struct kvm_vcpu_arch {
|
|||
u64 ia32_misc_enable_msr;
|
||||
u64 smbase;
|
||||
u64 smi_count;
|
||||
bool at_instruction_boundary;
|
||||
bool tpr_access_reporting;
|
||||
bool xsaves_enabled;
|
||||
bool xfd_no_write_intercept;
|
||||
|
@ -1300,6 +1301,8 @@ struct kvm_vcpu_stat {
|
|||
u64 nested_run;
|
||||
u64 directed_yield_attempted;
|
||||
u64 directed_yield_successful;
|
||||
u64 preemption_reported;
|
||||
u64 preemption_other;
|
||||
u64 guest_mode;
|
||||
};
|
||||
|
||||
|
|
|
@ -439,7 +439,7 @@ do { \
|
|||
[ptr] "+m" (*_ptr), \
|
||||
[old] "+a" (__old) \
|
||||
: [new] ltype (__new) \
|
||||
: "memory", "cc"); \
|
||||
: "memory"); \
|
||||
if (unlikely(__err)) \
|
||||
goto label; \
|
||||
if (unlikely(!success)) \
|
||||
|
|
|
@ -5179,7 +5179,7 @@ static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
|
|||
roots_to_free |= KVM_MMU_ROOT_CURRENT;
|
||||
|
||||
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
|
||||
if (is_obsolete_root(kvm, mmu->root.hpa))
|
||||
if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
|
||||
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
|
||||
}
|
||||
|
||||
|
|
|
@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Step the iterator back up a level in the paging structure. Should only be
|
||||
* used when the iterator is below the root level.
|
||||
*/
|
||||
void tdp_iter_step_up(struct tdp_iter *iter)
|
||||
{
|
||||
WARN_ON(!try_step_up(iter));
|
||||
}
|
||||
|
||||
/*
|
||||
* Step to the next SPTE in a pre-order traversal of the paging structure.
|
||||
* To get to the next SPTE, the iterator either steps down towards the goal
|
||||
|
|
|
@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
|
|||
int min_level, gfn_t next_last_level_gfn);
|
||||
void tdp_iter_next(struct tdp_iter *iter);
|
||||
void tdp_iter_restart(struct tdp_iter *iter);
|
||||
void tdp_iter_step_up(struct tdp_iter *iter);
|
||||
|
||||
#endif /* __KVM_X86_MMU_TDP_ITER_H */
|
||||
|
|
|
@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
|
|||
gfn_t start = slot->base_gfn;
|
||||
gfn_t end = start + slot->npages;
|
||||
struct tdp_iter iter;
|
||||
int max_mapping_level;
|
||||
kvm_pfn_t pfn;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
tdp_root_for_each_pte(iter, root, start, end) {
|
||||
retry:
|
||||
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
|
||||
continue;
|
||||
|
||||
|
@ -1755,15 +1755,41 @@ retry:
|
|||
!is_last_spte(iter.old_spte, iter.level))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* This is a leaf SPTE. Check if the PFN it maps can
|
||||
* be mapped at a higher level.
|
||||
*/
|
||||
pfn = spte_to_pfn(iter.old_spte);
|
||||
if (kvm_is_reserved_pfn(pfn) ||
|
||||
iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
|
||||
pfn, PG_LEVEL_NUM))
|
||||
|
||||
if (kvm_is_reserved_pfn(pfn))
|
||||
continue;
|
||||
|
||||
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
|
||||
iter.gfn, pfn, PG_LEVEL_NUM);
|
||||
|
||||
WARN_ON(max_mapping_level < iter.level);
|
||||
|
||||
/*
|
||||
* If this page is already mapped at the highest
|
||||
* viable level, there's nothing more to do.
|
||||
*/
|
||||
if (max_mapping_level == iter.level)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* The page can be remapped at a higher level, so step
|
||||
* up to zap the parent SPTE.
|
||||
*/
|
||||
while (max_mapping_level > iter.level)
|
||||
tdp_iter_step_up(&iter);
|
||||
|
||||
/* Note, a successful atomic zap also does a remote TLB flush. */
|
||||
if (tdp_mmu_zap_spte_atomic(kvm, &iter))
|
||||
goto retry;
|
||||
tdp_mmu_zap_spte_atomic(kvm, &iter);
|
||||
|
||||
/*
|
||||
* If the atomic zap fails, the iter will recurse back into
|
||||
* the same subtree to retry.
|
||||
*/
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
|
|
@ -982,7 +982,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
|
|||
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
|
||||
WARN_ON(!svm->tsc_scaling_enabled);
|
||||
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
|
||||
svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
|
||||
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
|
||||
}
|
||||
|
||||
svm->nested.ctl.nested_cr3 = 0;
|
||||
|
@ -1387,7 +1387,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.tsc_scaling_ratio =
|
||||
kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
|
||||
svm->tsc_ratio_msr);
|
||||
svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
|
||||
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
|
||||
}
|
||||
|
||||
/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
|
||||
|
|
|
@ -465,11 +465,24 @@ static int has_svm(void)
|
|||
return 1;
|
||||
}
|
||||
|
||||
void __svm_write_tsc_multiplier(u64 multiplier)
|
||||
{
|
||||
preempt_disable();
|
||||
|
||||
if (multiplier == __this_cpu_read(current_tsc_ratio))
|
||||
goto out;
|
||||
|
||||
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
|
||||
__this_cpu_write(current_tsc_ratio, multiplier);
|
||||
out:
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void svm_hardware_disable(void)
|
||||
{
|
||||
/* Make sure we clean up behind us */
|
||||
if (tsc_scaling)
|
||||
wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
|
||||
__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
|
||||
|
||||
cpu_svm_disable();
|
||||
|
||||
|
@ -515,8 +528,7 @@ static int svm_hardware_enable(void)
|
|||
* Set the default value, even if we don't use TSC scaling
|
||||
* to avoid having stale value in the msr
|
||||
*/
|
||||
wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
|
||||
__this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
|
||||
__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
|
||||
}
|
||||
|
||||
|
||||
|
@ -999,11 +1011,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
|
|||
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
|
||||
}
|
||||
|
||||
void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
|
||||
static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
|
||||
{
|
||||
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
|
||||
__svm_write_tsc_multiplier(multiplier);
|
||||
}
|
||||
|
||||
|
||||
/* Evaluate instruction intercepts that depend on guest CPUID features. */
|
||||
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
|
||||
struct vcpu_svm *svm)
|
||||
|
@ -1363,13 +1376,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
|
|||
sev_es_prepare_switch_to_guest(hostsa);
|
||||
}
|
||||
|
||||
if (tsc_scaling) {
|
||||
u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
|
||||
if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
|
||||
__this_cpu_write(current_tsc_ratio, tsc_ratio);
|
||||
wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
|
||||
}
|
||||
}
|
||||
if (tsc_scaling)
|
||||
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
|
||||
|
||||
if (likely(tsc_aux_uret_slot >= 0))
|
||||
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
|
||||
|
@ -4255,6 +4263,8 @@ out:
|
|||
|
||||
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
|
||||
vcpu->arch.at_instruction_boundary = true;
|
||||
}
|
||||
|
||||
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
|
||||
|
|
|
@ -590,7 +590,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
|
|||
bool has_error_code, u32 error_code);
|
||||
int nested_svm_exit_special(struct vcpu_svm *svm);
|
||||
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
|
||||
void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
|
||||
void __svm_write_tsc_multiplier(u64 multiplier);
|
||||
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
|
||||
struct vmcb_control_area *control);
|
||||
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
|
||||
|
|
|
@ -6547,6 +6547,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
|
|||
return;
|
||||
|
||||
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
|
||||
vcpu->arch.at_instruction_boundary = true;
|
||||
}
|
||||
|
||||
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
|
||||
|
|
|
@ -296,6 +296,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
|
|||
STATS_DESC_COUNTER(VCPU, nested_run),
|
||||
STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
|
||||
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
|
||||
STATS_DESC_COUNTER(VCPU, preemption_reported),
|
||||
STATS_DESC_COUNTER(VCPU, preemption_other),
|
||||
STATS_DESC_ICOUNTER(VCPU, guest_mode)
|
||||
};
|
||||
|
||||
|
@ -4625,6 +4627,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
|
|||
struct kvm_memslots *slots;
|
||||
static const u8 preempted = KVM_VCPU_PREEMPTED;
|
||||
|
||||
/*
|
||||
* The vCPU can be marked preempted if and only if the VM-Exit was on
|
||||
* an instruction boundary and will not trigger guest emulation of any
|
||||
* kind (see vcpu_run). Vendor specific code controls (conservatively)
|
||||
* when this is true, for example allowing the vCPU to be marked
|
||||
* preempted if and only if the VM-Exit was due to a host interrupt.
|
||||
*/
|
||||
if (!vcpu->arch.at_instruction_boundary) {
|
||||
vcpu->stat.preemption_other++;
|
||||
return;
|
||||
}
|
||||
|
||||
vcpu->stat.preemption_reported++;
|
||||
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
|
||||
return;
|
||||
|
||||
|
@ -4654,19 +4669,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
int idx;
|
||||
|
||||
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
|
||||
vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
|
||||
if (vcpu->preempted) {
|
||||
if (!vcpu->arch.guest_state_protected)
|
||||
vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
|
||||
|
||||
/*
|
||||
* Take the srcu lock as memslots will be accessed to check the gfn
|
||||
* cache generation against the memslots generation.
|
||||
*/
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
if (kvm_xen_msr_enabled(vcpu->kvm))
|
||||
kvm_xen_runstate_set_preempted(vcpu);
|
||||
else
|
||||
kvm_steal_time_set_preempted(vcpu);
|
||||
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
||||
/*
|
||||
* Take the srcu lock as memslots will be accessed to check the gfn
|
||||
* cache generation against the memslots generation.
|
||||
*/
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
if (kvm_xen_msr_enabled(vcpu->kvm))
|
||||
kvm_xen_runstate_set_preempted(vcpu);
|
||||
else
|
||||
kvm_steal_time_set_preempted(vcpu);
|
||||
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
||||
}
|
||||
|
||||
static_call(kvm_x86_vcpu_put)(vcpu);
|
||||
vcpu->arch.last_host_tsc = rdtsc();
|
||||
|
@ -10422,6 +10439,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.l1tf_flush_l1d = true;
|
||||
|
||||
for (;;) {
|
||||
/*
|
||||
* If another guest vCPU requests a PV TLB flush in the middle
|
||||
* of instruction emulation, the rest of the emulation could
|
||||
* use a stale page translation. Assume that any code after
|
||||
* this point can start executing an instruction.
|
||||
*/
|
||||
vcpu->arch.at_instruction_boundary = false;
|
||||
if (kvm_vcpu_running(vcpu)) {
|
||||
r = vcpu_enter_guest(vcpu);
|
||||
} else {
|
||||
|
|
|
@ -159,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
|
|||
* behalf of the vCPU. Only if the VMM does actually block
|
||||
* does it need to enter RUNSTATE_blocked.
|
||||
*/
|
||||
if (vcpu->preempted)
|
||||
kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
|
||||
if (WARN_ON_ONCE(!vcpu->preempted))
|
||||
return;
|
||||
|
||||
kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
|
||||
}
|
||||
|
||||
/* 32-bit compatibility definitions, also used natively in 32-bit build */
|
||||
|
|
|
@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
|
|||
int ret;
|
||||
|
||||
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
|
||||
clear_notify_signal();
|
||||
if (task_work_pending(current))
|
||||
task_work_run();
|
||||
}
|
||||
|
||||
if (ti_work & _TIF_SIGPENDING) {
|
||||
kvm_handle_signal_exit(vcpu);
|
||||
return -EINTR;
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ static inline void nop_loop(void)
|
|||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 1000000; i++)
|
||||
for (i = 0; i < 100000000; i++)
|
||||
asm volatile("nop");
|
||||
}
|
||||
|
||||
|
@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void)
|
|||
tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
|
||||
GUEST_ASSERT(tsc_freq > 0);
|
||||
|
||||
/* First, check MSR-based clocksource */
|
||||
/* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */
|
||||
r1 = rdtsc();
|
||||
t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
|
||||
r1 = (r1 + rdtsc()) / 2;
|
||||
nop_loop();
|
||||
r2 = rdtsc();
|
||||
t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
|
||||
r2 = (r2 + rdtsc()) / 2;
|
||||
|
||||
GUEST_ASSERT(r2 > r1 && t2 > t1);
|
||||
|
||||
|
@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm)
|
|||
tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY);
|
||||
TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
|
||||
|
||||
/* First, check MSR-based clocksource */
|
||||
/* For increased accuracy, take mean rdtsc() before and afrer ioctl */
|
||||
r1 = rdtsc();
|
||||
t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
|
||||
r1 = (r1 + rdtsc()) / 2;
|
||||
nop_loop();
|
||||
r2 = rdtsc();
|
||||
t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
|
||||
r2 = (r2 + rdtsc()) / 2;
|
||||
|
||||
TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2);
|
||||
|
||||
|
|
|
@ -4300,8 +4300,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
|
|||
kvm_put_kvm_no_destroy(kvm);
|
||||
mutex_lock(&kvm->lock);
|
||||
list_del(&dev->vm_node);
|
||||
if (ops->release)
|
||||
ops->release(dev);
|
||||
mutex_unlock(&kvm->lock);
|
||||
ops->destroy(dev);
|
||||
if (ops->destroy)
|
||||
ops->destroy(dev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue