1
0
Fork 0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-01-24 09:13:20 -05:00
* Remove 's' & 'u' as valid ISA extension
 
 * Do not allow disabling the base extensions 'i'/'m'/'a'/'c'
 
 x86:
 
 * Fix NMI watchdog in guests on AMD
 
 * Fix for SEV cache incoherency issues
 
 * Don't re-acquire SRCU lock in complete_emulated_io()
 
 * Avoid NULL pointer deref if VM creation fails
 
 * Fix race conditions between APICv disabling and vCPU creation
 
 * Bugfixes for disabling of APICv
 
 * Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
 
 selftests:
 
 * Do not use bitfields larger than 32-bits, they differ between GCC and clang
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmJi3KUUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroMhvQf/Yncfg3MkOvKsVxnCe7diKDTI/E2n
 wBGNIcL8r7L9oIltHL4Mh7JQTacHFQOZ9PQ30NO1p+pznZ03e8LR59IF1JpP7VOU
 sWrLZ5a4bIAEjOpA7Jxcee6hUBwewBauDgFLbb+YAI2lAahiH7jVfywDRife/c3k
 N2LjeA75K8UvMiDCfjxxxerFJK91zaqjWlUNF2OhtFp/5pnMfS+nli9Q8QS837pZ
 oUf+0Beb2RpSHan+wbYVU7X3ZLwtpR0M3w3uXOG+X3as56wDf26znXS02aSwa45x
 lfX+pqJfmb4vCJJDXt6avH27EVgTq0Vew+BhQHG3VLRO6uxZ+smX6qmsuw==
 =kvbw
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "The main and larger change here is a workaround for AMD's lack of
  cache coherency for encrypted-memory guests.

  I have another patch pending, but it's waiting for review from the
  architecture maintainers.

  RISC-V:

   - Remove 's' & 'u' as valid ISA extension

   - Do not allow disabling the base extensions 'i'/'m'/'a'/'c'

  x86:

   - Fix NMI watchdog in guests on AMD

   - Fix for SEV cache incoherency issues

   - Don't re-acquire SRCU lock in complete_emulated_io()

   - Avoid NULL pointer deref if VM creation fails

   - Fix race conditions between APICv disabling and vCPU creation

   - Bugfixes for disabling of APICv

   - Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume

  selftests:

   - Do not use bitfields larger than 32-bits, they differ between GCC
     and clang"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: selftests: introduce and use more page size-related constants
  kvm: selftests: do not use bitfields larger than 32-bits for PTEs
  KVM: SEV: add cache flush to solve SEV cache incoherency issues
  KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT CPUs
  KVM: SVM: Simplify and harden helper to flush SEV guest page(s)
  KVM: selftests: Silence compiler warning in the kvm_page_table_test
  KVM: x86/pmu: Update AMD PMC sample period to fix guest NMI-watchdog
  x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
  KVM: SPDX style and spelling fixes
  KVM: x86: Skip KVM_GUESTDBG_BLOCKIRQ APICv update if APICv is disabled
  KVM: x86: Pend KVM_REQ_APICV_UPDATE during vCPU creation to fix a race
  KVM: nVMX: Defer APICv updates while L2 is active until L1 is active
  KVM: x86: Tag APICv DISABLE inhibit, not ABSENT, if APICv is disabled
  KVM: Initialize debugfs_dentry when a VM is created to avoid NULL deref
  KVM: Add helpers to wrap vcpu->srcu_idx and yell if it's abused
  KVM: RISC-V: Use kvm_vcpu.srcu_idx, drop RISC-V's unnecessary copy
  KVM: x86: Don't re-acquire SRCU lock in complete_emulated_io()
  RISC-V: KVM: Restrict the extensions that can be disabled
  RISC-V: KVM: Remove 's' & 'u' as valid ISA extension
This commit is contained in:
Linus Torvalds 2022-04-22 17:58:36 -07:00
commit bb4ce2c658
36 changed files with 318 additions and 254 deletions

View file

@ -168,9 +168,10 @@ int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
return -EINVAL;
/* Read the entry from guest memory */
addr = base + (index * sizeof(rpte));
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (ret) {
if (pte_ret_p)
*pte_ret_p = addr;
@ -246,9 +247,9 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
/* Read the table to find the root of the radix tree */
ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (ret)
return ret;

View file

@ -306,10 +306,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
/* copy parameters in */
hv_ptr = kvmppc_get_gpr(vcpu, 4);
regs_ptr = kvmppc_get_gpr(vcpu, 5);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
hv_ptr, regs_ptr);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (err)
return H_PARAMETER;
@ -410,10 +410,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
byteswap_hv_regs(&l2_hv);
byteswap_pt_regs(&l2_regs);
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
hv_ptr, regs_ptr);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (err)
return H_AUTHORITY;
@ -600,16 +600,16 @@ long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
goto not_found;
/* Write what was loaded into our buffer back to the L1 guest */
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (rc)
goto not_found;
} else {
/* Load the data to be stored from the L1 guest into our buf */
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (rc)
goto not_found;

View file

@ -229,9 +229,9 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
*/
args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (rc)
goto fail;

View file

@ -425,9 +425,9 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
return EMULATE_DONE;
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
rc = kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (rc)
return EMULATE_DO_MMIO;

View file

@ -193,9 +193,6 @@ struct kvm_vcpu_arch {
/* Don't run the VCPU (blocked) */
bool pause;
/* SRCU lock index for in-kernel run loop */
int srcu_idx;
};
static inline void kvm_arch_hardware_unsetup(void) {}

View file

@ -38,14 +38,16 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
#define KVM_RISCV_ISA_ALLOWED (riscv_isa_extension_mask(a) | \
riscv_isa_extension_mask(c) | \
riscv_isa_extension_mask(d) | \
riscv_isa_extension_mask(f) | \
riscv_isa_extension_mask(i) | \
riscv_isa_extension_mask(m) | \
riscv_isa_extension_mask(s) | \
riscv_isa_extension_mask(u))
#define KVM_RISCV_ISA_DISABLE_ALLOWED (riscv_isa_extension_mask(d) | \
riscv_isa_extension_mask(f))
#define KVM_RISCV_ISA_DISABLE_NOT_ALLOWED (riscv_isa_extension_mask(a) | \
riscv_isa_extension_mask(c) | \
riscv_isa_extension_mask(i) | \
riscv_isa_extension_mask(m))
#define KVM_RISCV_ISA_ALLOWED (KVM_RISCV_ISA_DISABLE_ALLOWED | \
KVM_RISCV_ISA_DISABLE_NOT_ALLOWED)
static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
{
@ -219,7 +221,8 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
switch (reg_num) {
case KVM_REG_RISCV_CONFIG_REG(isa):
if (!vcpu->arch.ran_atleast_once) {
vcpu->arch.isa = reg_val;
/* Ignore the disable request for these extensions */
vcpu->arch.isa = reg_val | KVM_RISCV_ISA_DISABLE_NOT_ALLOWED;
vcpu->arch.isa &= riscv_isa_extension_base(NULL);
vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;
kvm_riscv_vcpu_fp_reset(vcpu);
@ -724,13 +727,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Mark this VCPU ran at least once */
vcpu->arch.ran_atleast_once = true;
vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
/* Process MMIO value returned from user-space */
if (run->exit_reason == KVM_EXIT_MMIO) {
ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run);
if (ret) {
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
return ret;
}
}
@ -739,13 +742,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (run->exit_reason == KVM_EXIT_RISCV_SBI) {
ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run);
if (ret) {
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
return ret;
}
}
if (run->immediate_exit) {
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
return -EINTR;
}
@ -784,7 +787,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
vcpu->mode = IN_GUEST_MODE;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
smp_mb__after_srcu_read_unlock();
/*
@ -802,7 +805,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
local_irq_enable();
preempt_enable();
vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
continue;
}
@ -846,7 +849,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
preempt_enable();
vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
ret = kvm_riscv_vcpu_exit(vcpu, run, &trap);
}
@ -855,7 +858,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
return ret;
}

View file

@ -456,9 +456,9 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu)) {
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_halt(vcpu);
vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
}
}

View file

@ -1334,11 +1334,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL);
VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
no_timer:
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_halt(vcpu);
vcpu->valid_wakeup = false;
__unset_cpu_idle(vcpu);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
hrtimer_cancel(&vcpu->arch.ckc_timer);
return 0;

View file

@ -4237,14 +4237,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* We try to hold kvm->srcu during most of vcpu_run (except when run-
* ning the guest), so that memslots (and other stuff) are protected
*/
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
do {
rc = vcpu_pre_run(vcpu);
if (rc)
break;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
* guest_enter and guest_exit should be no uaccess.
@ -4281,12 +4281,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
__enable_cpu_timer_accounting(vcpu);
guest_exit_irqoff();
local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
} while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}

View file

@ -1091,7 +1091,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
handle_last_fault(vcpu, vsie_page);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
/* save current guest state of bp isolation override */
guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
@ -1133,7 +1133,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
if (rc == -EINTR) {
VCPU_EVENT(vcpu, 3, "%s", "machine check");

View file

@ -118,6 +118,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)

View file

@ -1484,6 +1484,7 @@ struct kvm_x86_ops {
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
void (*guest_memory_reclaimed)(struct kvm *kvm);
int (*get_msr_feature)(struct kvm_msr_entry *entry);

View file

@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static int kvm_suspend(void)
{
u64 val = 0;
kvm_guest_cpu_offline(false);
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
rdmsrl(MSR_KVM_POLL_CONTROL, val);
has_guest_poll = !(val & 1);
#endif
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
wrmsrl(MSR_KVM_POLL_CONTROL, 0);
#endif
}
static struct syscore_ops kvm_syscore_ops = {

View file

@ -138,6 +138,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
return sample_period;
}
static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
{
if (!pmc->perf_event || pmc->is_paused)
return;
perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter));
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);

View file

@ -257,6 +257,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */

View file

@ -2226,51 +2226,47 @@ int sev_cpu_init(struct svm_cpu_data *sd)
* Pages used by hardware to hold guest encrypted state must be flushed before
* returning them to the system.
*/
static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
unsigned long len)
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
/*
* If hardware enforced cache coherency for encrypted mappings of the
* same physical page is supported, nothing to do.
* Note! The address must be a kernel address, as regular page walk
* checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
* address is non-deterministic and unsafe. This function deliberately
* takes a pointer to deter passing in a user address.
*/
if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
unsigned long addr = (unsigned long)va;
/*
* If CPU enforced cache coherency for encrypted mappings of the
* same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
* flush is still needed in order to work properly with DMA devices.
*/
if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
clflush_cache_range(va, PAGE_SIZE);
return;
/*
* If the VM Page Flush MSR is supported, use it to flush the page
* (using the page virtual address and the guest ASID).
*/
if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
struct kvm_sev_info *sev;
unsigned long va_start;
u64 start, stop;
/* Align start and stop to page boundaries. */
va_start = (unsigned long)va;
start = (u64)va_start & PAGE_MASK;
stop = PAGE_ALIGN((u64)va_start + len);
if (start < stop) {
sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
while (start < stop) {
wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
start | sev->asid);
start += PAGE_SIZE;
}
return;
}
WARN(1, "Address overflow, using WBINVD\n");
}
/*
* Hardware should always have one of the above features,
* but if not, use WBINVD and issue a warning.
* VM Page Flush takes a host virtual address and a guest ASID. Fall
* back to WBINVD if this faults so as not to make any problems worse
* by leaving stale encrypted data in the cache.
*/
WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
goto do_wbinvd;
return;
do_wbinvd:
wbinvd_on_all_cpus();
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
if (!sev_guest(kvm))
return;
wbinvd_on_all_cpus();
}
@ -2284,7 +2280,8 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
if (vcpu->arch.guest_state_protected)
sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)

View file

@ -4620,6 +4620,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
.guest_memory_reclaimed = sev_guest_memory_reclaimed,
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,

View file

@ -609,6 +609,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
struct kvm_enc_region *range);
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void sev_guest_memory_reclaimed(struct kvm *kvm);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);

View file

@ -4618,6 +4618,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
}
if (vmx->nested.update_vmcs01_apicv_status) {
vmx->nested.update_vmcs01_apicv_status = false;
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;

View file

@ -431,15 +431,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)

View file

@ -4174,6 +4174,11 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_apicv_status = true;
return;
}
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
if (cpu_has_secondary_exec_ctrls()) {
if (kvm_vcpu_apicv_active(vcpu))

View file

@ -183,6 +183,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
bool update_vmcs01_apicv_status;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to

View file

@ -9111,7 +9111,7 @@ static void kvm_apicv_init(struct kvm *kvm)
if (!enable_apicv)
set_or_clear_apicv_inhibit(inhibits,
APICV_INHIBIT_REASON_ABSENT, true);
APICV_INHIBIT_REASON_DISABLE, true);
}
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@ -9889,6 +9889,11 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
}
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@ -10097,7 +10102,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Store vcpu->apicv_active before vcpu->mode. */
smp_store_release(&vcpu->mode, IN_GUEST_MODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
/*
* 1) We should set ->mode before checking ->requests. Please see
@ -10128,7 +10133,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_wmb();
local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
r = 1;
goto cancel_injection;
}
@ -10254,7 +10259,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
/*
* Profile KVM exit RIPs:
@ -10284,7 +10289,7 @@ out:
}
/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
bool hv_timer;
@ -10300,12 +10305,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu);
else
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
@ -10347,7 +10352,6 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->arch.l1tf_flush_l1d = true;
@ -10355,7 +10359,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
r = vcpu_block(vcpu);
}
if (r <= 0)
@ -10374,9 +10378,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
}
if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
r = xfer_to_guest_mode_handle_work(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
if (r)
return r;
}
@ -10387,12 +10391,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
int r;
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
return r;
return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@ -10484,7 +10483,6 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
struct kvm *kvm = vcpu->kvm;
int r;
vcpu_load(vcpu);
@ -10492,7 +10490,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@ -10504,9 +10502,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_vcpu_srcu_read_lock(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@ -10567,7 +10565,7 @@ out:
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
@ -10985,6 +10983,9 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
if (!enable_apicv)
return;
down_write(&kvm->arch.apicv_update_lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
@ -11196,8 +11197,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
if (kvm_apicv_activated(vcpu->kvm))
/*
* Defer evaluating inhibits until the vCPU is first run, as
* this vCPU will not get notified of any changes until this
* vCPU is visible to other vCPUs (marked online and added to
* the set of vCPUs). Opportunistically mark APICv active as
* VMX in particularly is highly unlikely to have inhibits.
* Ignore the current per-VM APICv state so that vCPU creation
* is guaranteed to run with a deterministic value, the request
* will ensure the vCPU gets the correct state before VM-Entry.
*/
if (enable_apicv) {
vcpu->arch.apicv_active = true;
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
} else
static_branch_inc(&kvm_has_noapic_vcpu);

View file

@ -315,7 +315,10 @@ struct kvm_vcpu {
int cpu;
int vcpu_id; /* id given by userspace at creation */
int vcpu_idx; /* index in kvm->vcpus array */
int srcu_idx;
int ____srcu_idx; /* Don't use this directly. You've been warned. */
#ifdef CONFIG_PROVE_RCU
int srcu_depth;
#endif
int mode;
u64 requests;
unsigned long guest_debug;
@ -840,6 +843,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
unlikely(__ret); \
})
static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_PROVE_RCU
WARN_ONCE(vcpu->srcu_depth++,
"KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1);
#endif
vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}
static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu)
{
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx);
#ifdef CONFIG_PROVE_RCU
WARN_ONCE(--vcpu->srcu_depth,
"KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth);
#endif
}
static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
{
return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
@ -2197,6 +2219,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end);
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
#else

View file

@ -60,6 +60,23 @@
/* CPUID.0x8000_0001.EDX */
#define CPUID_GBPAGES (1ul << 26)
/* Page table bitfield declarations */
#define PTE_PRESENT_MASK BIT_ULL(0)
#define PTE_WRITABLE_MASK BIT_ULL(1)
#define PTE_USER_MASK BIT_ULL(2)
#define PTE_ACCESSED_MASK BIT_ULL(5)
#define PTE_DIRTY_MASK BIT_ULL(6)
#define PTE_LARGE_MASK BIT_ULL(7)
#define PTE_GLOBAL_MASK BIT_ULL(8)
#define PTE_NX_MASK BIT_ULL(63)
#define PAGE_SHIFT 12
#define PAGE_SIZE (1ULL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12)
#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
/* General Registers in 64-Bit Mode */
struct gpr64_regs {
u64 rax;

View file

@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
else
guest_test_phys_mem = p->phys_offset;
#ifdef __s390x__
alignment = max(0x100000, alignment);
alignment = max(0x100000UL, alignment);
#endif
guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);

View file

@ -19,38 +19,6 @@
vm_vaddr_t exception_handlers;
/* Virtual translation table structure declarations */
struct pageUpperEntry {
uint64_t present:1;
uint64_t writable:1;
uint64_t user:1;
uint64_t write_through:1;
uint64_t cache_disable:1;
uint64_t accessed:1;
uint64_t ignored_06:1;
uint64_t page_size:1;
uint64_t ignored_11_08:4;
uint64_t pfn:40;
uint64_t ignored_62_52:11;
uint64_t execute_disable:1;
};
struct pageTableEntry {
uint64_t present:1;
uint64_t writable:1;
uint64_t user:1;
uint64_t write_through:1;
uint64_t cache_disable:1;
uint64_t accessed:1;
uint64_t dirty:1;
uint64_t reserved_07:1;
uint64_t global:1;
uint64_t ignored_11_09:3;
uint64_t pfn:40;
uint64_t ignored_62_52:11;
uint64_t execute_disable:1;
};
void regs_dump(FILE *stream, struct kvm_regs *regs,
uint8_t indent)
{
@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
return &page_table[index];
}
static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
uint64_t pt_pfn,
uint64_t vaddr,
uint64_t paddr,
int level,
enum x86_page_size page_size)
static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
uint64_t pt_pfn,
uint64_t vaddr,
uint64_t paddr,
int level,
enum x86_page_size page_size)
{
struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
if (!pte->present) {
pte->writable = true;
pte->present = true;
pte->page_size = (level == page_size);
if (pte->page_size)
pte->pfn = paddr >> vm->page_shift;
if (!(*pte & PTE_PRESENT_MASK)) {
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
if (level == page_size)
*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
else
pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
} else {
/*
* Entry already present. Assert that the caller doesn't want
@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
TEST_ASSERT(level != page_size,
"Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
page_size, vaddr);
TEST_ASSERT(!pte->page_size,
TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
"Cannot create page table at level: %u, vaddr: 0x%lx\n",
level, vaddr);
}
@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
enum x86_page_size page_size)
{
const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
struct pageUpperEntry *pml4e, *pdpe, *pde;
struct pageTableEntry *pte;
uint64_t *pml4e, *pdpe, *pde;
uint64_t *pte;
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
"Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
*/
pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
vaddr, paddr, 3, page_size);
if (pml4e->page_size)
if (*pml4e & PTE_LARGE_MASK)
return;
pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
if (pdpe->page_size)
pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
if (*pdpe & PTE_LARGE_MASK)
return;
pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
if (pde->page_size)
pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
if (*pde & PTE_LARGE_MASK)
return;
/* Fill in page table entry. */
pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
TEST_ASSERT(!pte->present,
pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
"PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
pte->pfn = paddr >> vm->page_shift;
pte->writable = true;
pte->present = 1;
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
}
void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@ -282,22 +246,22 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
}
static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
uint64_t vaddr)
{
uint16_t index[4];
struct pageUpperEntry *pml4e, *pdpe, *pde;
struct pageTableEntry *pte;
uint64_t *pml4e, *pdpe, *pde;
uint64_t *pte;
struct kvm_cpuid_entry2 *entry;
struct kvm_sregs sregs;
int max_phy_addr;
/* Set the bottom 52 bits. */
uint64_t rsvd_mask = 0x000fffffffffffff;
uint64_t rsvd_mask = 0;
entry = kvm_get_supported_cpuid_index(0x80000008, 0);
max_phy_addr = entry->eax & 0x000000ff;
/* Clear the bottom bits of the reserved mask. */
rsvd_mask = (rsvd_mask >> max_phy_addr) << max_phy_addr;
/* Set the high bits in the reserved mask. */
if (max_phy_addr < 52)
rsvd_mask = GENMASK_ULL(51, max_phy_addr);
/*
* SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries
@ -307,7 +271,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
*/
vcpu_sregs_get(vm, vcpuid, &sregs);
if ((sregs.efer & EFER_NX) == 0) {
rsvd_mask |= (1ull << 63);
rsvd_mask |= PTE_NX_MASK;
}
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
index[3] = (vaddr >> 39) & 0x1ffu;
pml4e = addr_gpa2hva(vm, vm->pgd);
TEST_ASSERT(pml4e[index[3]].present,
TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
"Expected pml4e to be present for gva: 0x%08lx", vaddr);
TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
(rsvd_mask | (1ull << 7))) == 0,
TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
"Unexpected reserved bits set.");
pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
TEST_ASSERT(pdpe[index[2]].present,
pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
"Expected pdpe to be present for gva: 0x%08lx", vaddr);
TEST_ASSERT(pdpe[index[2]].page_size == 0,
TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
"Expected pdpe to map a pde not a 1-GByte page.");
TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
"Unexpected reserved bits set.");
pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
TEST_ASSERT(pde[index[1]].present,
pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
"Expected pde to be present for gva: 0x%08lx", vaddr);
TEST_ASSERT(pde[index[1]].page_size == 0,
TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
"Expected pde to map a pte not a 2-MByte page.");
TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
"Unexpected reserved bits set.");
pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
TEST_ASSERT(pte[index[0]].present,
pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
"Expected pte to be present for gva: 0x%08lx", vaddr);
return &pte[index[0]];
@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
{
struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
return *(uint64_t *)pte;
}
@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
uint64_t pte)
{
struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
vaddr);
uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
*(uint64_t *)new_pte = pte;
}
void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
struct pageUpperEntry *pml4e, *pml4e_start;
struct pageUpperEntry *pdpe, *pdpe_start;
struct pageUpperEntry *pde, *pde_start;
struct pageTableEntry *pte, *pte_start;
uint64_t *pml4e, *pml4e_start;
uint64_t *pdpe, *pdpe_start;
uint64_t *pde, *pde_start;
uint64_t *pte, *pte_start;
if (!vm->pgd_created)
return;
@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
fprintf(stream, "%*s index hvaddr gpaddr "
"addr w exec dirty\n",
indent, "");
pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
pml4e = &pml4e_start[n1];
if (!pml4e->present)
if (!(*pml4e & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
" %u\n",
indent, "",
pml4e - pml4e_start, pml4e,
addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
pml4e->writable, pml4e->execute_disable);
addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
pdpe = &pdpe_start[n2];
if (!pdpe->present)
if (!(*pdpe & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx "
"%u %u\n",
indent, "",
pdpe - pdpe_start, pdpe,
addr_hva2gpa(vm, pdpe),
(uint64_t) pdpe->pfn, pdpe->writable,
pdpe->execute_disable);
PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
!!(*pdpe & PTE_NX_MASK));
pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
pde = &pde_start[n3];
if (!pde->present)
if (!(*pde & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spde 0x%-3zx %p "
"0x%-12lx 0x%-10lx %u %u\n",
"0x%-12lx 0x%-10llx %u %u\n",
indent, "", pde - pde_start, pde,
addr_hva2gpa(vm, pde),
(uint64_t) pde->pfn, pde->writable,
pde->execute_disable);
PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
!!(*pde & PTE_NX_MASK));
pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
pte = &pte_start[n4];
if (!pte->present)
if (!(*pte & PTE_PRESENT_MASK))
continue;
fprintf(stream, "%*spte 0x%-3zx %p "
"0x%-12lx 0x%-10lx %u %u "
"0x%-12lx 0x%-10llx %u %u "
" %u 0x%-10lx\n",
indent, "",
pte - pte_start, pte,
addr_hva2gpa(vm, pte),
(uint64_t) pte->pfn,
pte->writable,
pte->execute_disable,
pte->dirty,
PTE_GET_PFN(*pte),
!!(*pte & PTE_WRITABLE_MASK),
!!(*pte & PTE_NX_MASK),
!!(*pte & PTE_DIRTY_MASK),
((uint64_t) n1 << 27)
| ((uint64_t) n2 << 18)
| ((uint64_t) n3 << 9)
@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
{
uint16_t index[4];
struct pageUpperEntry *pml4e, *pdpe, *pde;
struct pageTableEntry *pte;
uint64_t *pml4e, *pdpe, *pde;
uint64_t *pte;
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
if (!vm->pgd_created)
goto unmapped_gva;
pml4e = addr_gpa2hva(vm, vm->pgd);
if (!pml4e[index[3]].present)
if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
goto unmapped_gva;
pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
if (!pdpe[index[2]].present)
pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
goto unmapped_gva;
pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
if (!pde[index[1]].present)
pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
if (!(pde[index[1]] & PTE_PRESENT_MASK))
goto unmapped_gva;
pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
if (!pte[index[0]].present)
pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
if (!(pte[index[0]] & PTE_PRESENT_MASK))
goto unmapped_gva;
return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK);
unmapped_gva:
TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);

View file

@ -29,7 +29,6 @@
#define X86_FEATURE_XSAVE (1 << 26)
#define X86_FEATURE_OSXSAVE (1 << 27)
#define PAGE_SIZE (1 << 12)
#define NUM_TILES 8
#define TILE_SIZE 1024
#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE)

View file

@ -12,7 +12,6 @@
#include "vmx.h"
#define VCPU_ID 1
#define PAGE_SIZE 4096
#define MAXPHYADDR 36
#define MEM_REGION_GVA 0x0000123456789000

View file

@ -21,8 +21,6 @@
#define VCPU_ID 1
#define PAGE_SIZE 4096
#define SMRAM_SIZE 65536
#define SMRAM_MEMSLOT ((1 << 16) | 1)
#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE)

View file

@ -32,7 +32,6 @@
#define MSR_IA32_TSC_ADJUST 0x3b
#endif
#define PAGE_SIZE 4096
#define VCPU_ID 5
#define TSC_ADJUST_VALUE (1ll << 32)

View file

@ -23,7 +23,6 @@
#define SHINFO_REGION_GVA 0xc0000000ULL
#define SHINFO_REGION_GPA 0xc0000000ULL
#define SHINFO_REGION_SLOT 10
#define PAGE_SIZE 4096
#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (2 * PAGE_SIZE))
#define DUMMY_REGION_SLOT 11

View file

@ -15,7 +15,6 @@
#define HCALL_REGION_GPA 0xc0000000ULL
#define HCALL_REGION_SLOT 10
#define PAGE_SIZE 4096
static struct kvm_vm *vm;

View file

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0-only */
// SPDX-License-Identifier: GPL-2.0-only
/*
* KVM dirty ring implementation
*

View file

@ -164,6 +164,10 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
{
}
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
/*
@ -357,6 +361,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif
static void kvm_flush_shadow_all(struct kvm *kvm)
{
kvm_arch_flush_shadow_all(kvm);
kvm_arch_guest_memory_reclaimed(kvm);
}
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
gfp_t gfp_flags)
@ -485,12 +495,15 @@ typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
unsigned long end);
typedef void (*on_unlock_fn_t)(struct kvm *kvm);
struct kvm_hva_range {
unsigned long start;
unsigned long end;
pte_t pte;
hva_handler_t handler;
on_lock_fn_t on_lock;
on_unlock_fn_t on_unlock;
bool flush_on_ret;
bool may_block;
};
@ -578,8 +591,11 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
if (range->flush_on_ret && ret)
kvm_flush_remote_tlbs(kvm);
if (locked)
if (locked) {
KVM_MMU_UNLOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_unlock))
range->on_unlock(kvm);
}
srcu_read_unlock(&kvm->srcu, idx);
@ -600,6 +616,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
.pte = pte,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.on_unlock = (void *)kvm_null_fn,
.flush_on_ret = true,
.may_block = false,
};
@ -619,6 +636,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
.pte = __pte(0),
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.on_unlock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = false,
};
@ -662,7 +680,7 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
kvm->mmu_notifier_range_end = end;
} else {
/*
* Fully tracking multiple concurrent ranges has dimishing
* Fully tracking multiple concurrent ranges has diminishing
* returns. Keep things simple and just find the minimal range
* which includes the current and new ranges. As there won't be
* enough information to subtract a range after its invalidate
@ -687,6 +705,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
.pte = __pte(0),
.handler = kvm_unmap_gfn_range,
.on_lock = kvm_inc_notifier_count,
.on_unlock = kvm_arch_guest_memory_reclaimed,
.flush_on_ret = true,
.may_block = mmu_notifier_range_blockable(range),
};
@ -741,6 +760,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
.pte = __pte(0),
.handler = (void *)kvm_null_fn,
.on_lock = kvm_dec_notifier_count,
.on_unlock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = mmu_notifier_range_blockable(range),
};
@ -813,7 +833,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
int idx;
idx = srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow_all(kvm);
kvm_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
@ -955,12 +975,6 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
kvm_vcpu_stats_header.num_desc;
/*
* Force subsequent debugfs file creations to fail if the VM directory
* is not created.
*/
kvm->debugfs_dentry = ERR_PTR(-ENOENT);
if (!debugfs_initialized())
return 0;
@ -1081,6 +1095,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
/*
* Force subsequent debugfs file creations to fail if the VM directory
* is not created (by kvm_create_vm_debugfs()).
*/
kvm->debugfs_dentry = ERR_PTR(-ENOENT);
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
@ -1225,7 +1245,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
kvm->mn_active_invalidate_count = 0;
#else
kvm_arch_flush_shadow_all(kvm);
kvm_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_destroy_devices(kvm);
@ -1652,6 +1672,7 @@ static void kvm_invalidate_memslot(struct kvm *kvm,
* - kvm_is_visible_gfn (mmu_check_root)
*/
kvm_arch_flush_shadow_memslot(kvm, old);
kvm_arch_guest_memory_reclaimed(kvm);
/* Was released by kvm_swap_active_memslots, reacquire. */
mutex_lock(&kvm->slots_arch_lock);
@ -1799,7 +1820,7 @@ static int kvm_set_memslot(struct kvm *kvm,
/*
* No need to refresh new->arch, changes after dropping slots_arch_lock
* will directly hit the final, active memsot. Architectures are
* will directly hit the final, active memslot. Architectures are
* responsible for knowing that new->arch may be stale.
*/
kvm_commit_memory_region(kvm, old, new, change);

View file

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-only
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __KVM_MM_H__
#define __KVM_MM_H__ 1