diff options
Diffstat (limited to 'arch/x86/kernel/kvm.c')
-rw-r--r-- | arch/x86/kernel/kvm.c | 246 |
1 files changed, 140 insertions, 106 deletions
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 78bb0fae3982..a26643dc6bd6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -26,6 +26,7 @@ #include <linux/kprobes.h> #include <linux/nmi.h> #include <linux/swait.h> +#include <linux/syscore_ops.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -37,6 +38,7 @@ #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> #include <asm/ptrace.h> +#include <asm/reboot.h> #include <asm/svm.h> DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -345,7 +347,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); + pr_info("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { @@ -371,34 +373,17 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); + pr_info("disable async PF for cpu %d\n", smp_processor_id()); } -static void kvm_pv_guest_cpu_reboot(void *unused) +static void kvm_disable_steal_time(void) { - /* - * We disable PV EOI before we load a new kernel by kexec, - * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. - * New kernel can re-enable when it boots. - */ - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - kvm_disable_steal_time(); -} + if (!has_steal_clock) + return; -static int kvm_pv_reboot_notify(struct notifier_block *nb, - unsigned long code, void *unused) -{ - if (code == SYS_RESTART) - on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); - return NOTIFY_DONE; + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } -static struct notifier_block kvm_pv_reboot_nb = { - .notifier_call = kvm_pv_reboot_notify, -}; - static u64 kvm_steal_clock(int cpu) { u64 steal; @@ -416,14 +401,6 @@ static u64 kvm_steal_clock(int cpu) return steal; } -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - static inline void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); @@ -451,6 +428,31 @@ static void __init sev_map_percpu_data(void) } } +static void kvm_guest_cpu_offline(bool shutdown) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + if (!shutdown) + apf_task_wake_all(); + kvmclock_disable(); +} + +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +460,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +572,54 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + /* + * The local vCPU is never preempted, so we do not explicitly + * skip check for local vCPU - it will never be cleared from + * flushmask. + */ + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_multi(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -587,57 +633,65 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_offline(void) +static int kvm_cpu_down_prepare(unsigned int cpu) { - kvm_disable_steal_time(); - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - apf_task_wake_all(); + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_offline(false); + local_irq_restore(flags); + return 0; } -static int kvm_cpu_online(unsigned int cpu) +#endif + +static int kvm_suspend(void) { - local_irq_disable(); - kvm_guest_cpu_init(); - local_irq_enable(); + kvm_guest_cpu_offline(false); + return 0; } -static int kvm_cpu_down_prepare(unsigned int cpu) +static void kvm_resume(void) { - local_irq_disable(); - kvm_guest_cpu_offline(); - local_irq_enable(); - return 0; + kvm_cpu_online(raw_smp_processor_id()); } -#endif -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +static void kvm_pv_guest_cpu_reboot(void *unused) { - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + kvm_guest_cpu_offline(true); +} - cpumask_copy(flushmask, cpumask); - /* - * We have to call flush only on online vCPUs. And - * queue flush_on_enter for pre-empted vCPUs - */ - for_each_cpu(cpu, flushmask) { - src = &per_cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(&src->preempted, &state, - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; - native_flush_tlb_others(flushmask, info); +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. + */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); } +#endif static void __init kvm_guest_init(void) { @@ -650,13 +704,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; - } - - if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; - pr_info("KVM setup pv remote TLB flush\n"); + static_call_update(pv_steal_clock, kvm_steal_clock); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -668,6 +716,12 @@ static void __init kvm_guest_init(void) } #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; @@ -681,6 +735,12 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + + register_syscore_ops(&kvm_syscore_ops); + /* * Hard lockup detection is enabled by default. Disable it, as guests * can get false positives too easily, for example if the host is @@ -734,7 +794,7 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif @@ -794,32 +854,6 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); -static __init int kvm_alloc_cpumask(void) -{ - int cpu; - bool alloc = false; - - if (!kvm_para_available() || nopv) - return 0; - - if (pv_tlb_flush_supported()) - alloc = true; - -#if defined(CONFIG_SMP) - if (pv_ipi_supported()) - alloc = true; -#endif - - if (alloc) - for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), - GFP_KERNEL, cpu_to_node(cpu)); - } - - return 0; -} -arch_initcall(kvm_alloc_cpumask); - #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ |