aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/debugfs.c46
-rw-r--r--arch/x86/kvm/hyperv.c5
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/mmu.c118
-rw-r--r--arch/x86/kvm/svm.c15
-rw-r--r--arch/x86/kvm/vmx/nested.c4
-rw-r--r--arch/x86/kvm/vmx/vmx.c7
-rw-r--r--arch/x86/kvm/x86.c32
8 files changed, 159 insertions, 81 deletions
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 329361b69d5e..018aebce33ff 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -8,11 +8,6 @@
#include <linux/debugfs.h>
#include "lapic.h"
-bool kvm_arch_has_vcpu_debugfs(void)
-{
- return true;
-}
-
static int vcpu_get_timer_advance_ns(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
@@ -48,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
{
- struct dentry *ret;
-
- ret = debugfs_create_file("tsc-offset", 0444,
- vcpu->debugfs_dentry,
- vcpu, &vcpu_tsc_offset_fops);
- if (!ret)
- return -ENOMEM;
+ debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu,
+ &vcpu_tsc_offset_fops);
- if (lapic_in_kernel(vcpu)) {
- ret = debugfs_create_file("lapic_timer_advance_ns", 0444,
- vcpu->debugfs_dentry,
- vcpu, &vcpu_timer_advance_ns_fops);
- if (!ret)
- return -ENOMEM;
- }
+ if (lapic_in_kernel(vcpu))
+ debugfs_create_file("lapic_timer_advance_ns", 0444,
+ vcpu->debugfs_dentry, vcpu,
+ &vcpu_timer_advance_ns_fops);
if (kvm_has_tsc_control) {
- ret = debugfs_create_file("tsc-scaling-ratio", 0444,
- vcpu->debugfs_dentry,
- vcpu, &vcpu_tsc_scaling_fops);
- if (!ret)
- return -ENOMEM;
- ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
- vcpu->debugfs_dentry,
- vcpu, &vcpu_tsc_scaling_frac_fops);
- if (!ret)
- return -ENOMEM;
-
+ debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_fops);
+ debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_frac_fops);
}
-
- return 0;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c10a8b10b203..fff790a3f4ee 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1781,7 +1781,7 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
- uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
+ uint16_t evmcs_ver = 0;
struct kvm_cpuid_entry2 cpuid_entries[] = {
{ .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
{ .function = HYPERV_CPUID_INTERFACE },
@@ -1793,6 +1793,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
};
int i, nent = ARRAY_SIZE(cpuid_entries);
+ if (kvm_x86_ops->nested_get_evmcs_version)
+ evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
+
/* Skip NESTED_FEATURES if eVMCS is not supported */
if (!evmcs_ver)
--nent;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0aa158657f20..e904ff06a83d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -216,6 +216,9 @@ static void recalculate_apic_map(struct kvm *kvm)
if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
new->phys_map[xapic_id] = apic;
+ if (!kvm_apic_sw_enabled(apic))
+ continue;
+
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
if (apic_x2apic_mode(apic)) {
@@ -258,6 +261,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
static_key_slow_dec_deferred(&apic_sw_disabled);
else
static_key_slow_inc(&apic_sw_disabled.key);
+
+ recalculate_apic_map(apic->vcpu->kvm);
}
}
@@ -1548,7 +1553,6 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
- struct swait_queue_head *q = &vcpu->wq;
struct kvm_timer *ktimer = &apic->lapic_timer;
if (atomic_read(&apic->lapic_timer.pending))
@@ -1566,13 +1570,6 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu);
-
- /*
- * For x86, the atomic_inc() is serialized, thus
- * using swait_active() is safe.
- */
- if (swait_active(q))
- swake_up_one(q);
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24843cf49579..a63964e7cec7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->role.invalid) { \
+ if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
@@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
return false;
if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU generation
+ * number. However, changing the generation number is
+ * accompanied by KVM_REQ_MMU_RELOAD, which will free
+ * the root set here and allocate a new one.
+ */
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5649,44 +5668,91 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return alloc_mmu_pages(vcpu);
}
-static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node)
+
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
- struct kvm_mmu_page *sp;
+ struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
- unsigned long i;
- bool flush;
- gfn_t gfn;
-
- spin_lock(&kvm->mmu_lock);
-
- if (list_empty(&kvm->arch.active_mmu_pages))
- goto out_unlock;
-
- flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
+ int ign;
- for (i = 0; i < slot->npages; i++) {
- gfn = slot->base_gfn + i;
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete valid page exists before a newly created page
+ * since active_mmu_pages is a FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
- for_each_valid_sp(kvm, sp, gfn) {
- if (sp->gfn != gfn)
- continue;
+ /*
+ * Do not repeatedly zap a root page to avoid unnecessary
+ * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+ * progress:
+ * vcpu 0 vcpu 1
+ * call vcpu_enter_guest():
+ * 1): handle KVM_REQ_MMU_RELOAD
+ * and require mmu-lock to
+ * load mmu
+ * repeat:
+ * 1): zap root page and
+ * send KVM_REQ_MMU_RELOAD
+ *
+ * 2): if (cond_resched_lock(mmu-lock))
+ *
+ * 2): hold mmu-lock and load mmu
+ *
+ * 3): see KVM_REQ_MMU_RELOAD bit
+ * on vcpu->requests is set
+ * then return 1 to call
+ * vcpu_enter_guest() again.
+ * goto repeat;
+ *
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
- flush = false;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
cond_resched_lock(&kvm->mmu_lock);
+ goto restart;
}
+
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+ goto restart;
}
- kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
-out_unlock:
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+ spin_lock(&kvm->mmu_lock);
+ kvm->arch.mmu_valid_gen++;
+
+ kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
}
+static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node)
+{
+ kvm_mmu_zap_all_fast(kvm);
+}
+
void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7eafc6907861..e0368076a1ef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1714,7 +1714,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (!entry)
return -EINVAL;
- new_entry = READ_ONCE(*entry);
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
@@ -5190,6 +5189,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
kvm_vcpu_wake_up(vcpu);
}
+static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
@@ -7124,12 +7128,6 @@ failed:
return ret;
}
-static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
-{
- /* Not supported */
- return 0;
-}
-
static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
@@ -7314,6 +7312,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
+ .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
@@ -7327,7 +7326,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.mem_enc_unreg_region = svm_unregister_enc_region,
.nested_enable_evmcs = nested_enable_evmcs,
- .nested_get_evmcs_version = nested_get_evmcs_version,
+ .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ced9fba32598..a3cba321b5c5 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4540,6 +4540,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
int len;
gva_t gva = 0;
struct vmcs12 *vmcs12;
+ struct x86_exception e;
short offset;
if (!nested_vmx_check_permission(vcpu))
@@ -4588,7 +4589,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
vmx_instruction_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+ kvm_inject_page_fault(vcpu, &e);
}
return nested_vmx_succeed(vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 074385c86c09..c030c96fc81a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
+static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return pi_test_on(vcpu_to_pi_desc(vcpu));
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -7726,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
@@ -7791,6 +7797,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
};
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6d951cbd76c..91602d310a3f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5312,6 +5312,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -6594,12 +6601,13 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
- kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
- exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE && ctxt->tf)
+ kvm_vcpu_do_singlestep(vcpu, &r);
__kvm_set_rflags(vcpu, ctxt->eflags);
+ }
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -9698,6 +9706,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+ return true;
+
+ return false;
+}
+
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
return vcpu->arch.preempted_in_kernel;