aboutsummaryrefslogtreecommitdiff
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--virt/kvm/kvm_main.c165
1 files changed, 109 insertions, 56 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cb5c13eee193..5bbb5612b207 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -154,11 +154,6 @@ static unsigned long long kvm_active_vms;
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
-{
-}
-
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}
@@ -521,18 +516,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
return container_of(mn, struct kvm, mmu_notifier);
}
-static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int idx;
-
- idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
- srcu_read_unlock(&kvm->srcu, idx);
-}
-
typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
@@ -686,6 +669,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range);
}
+
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * Skipping invalid memslots is correct if and only change_pte() is
+ * surrounded by invalidate_range_{start,end}(), which is currently
+ * guaranteed by the primary MMU. If that ever changes, KVM needs to
+ * unmap the memslot instead of skipping the memslot to ensure that KVM
+ * doesn't hold references to the old PFN.
+ */
+ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+ if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+
+ return kvm_set_spte_gfn(kvm, range);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -707,7 +708,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
- kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
@@ -892,7 +893,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
- .invalidate_range = kvm_mmu_notifier_invalidate_range,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
@@ -2477,7 +2477,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
{
int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
- rc = get_user_pages(addr, 1, flags, NULL, NULL);
+ rc = get_user_pages(addr, 1, flags, NULL);
return rc == -EHWPOISON;
}
@@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
{
kvm_pfn_t pfn;
pte_t *ptep;
+ pte_t pte;
spinlock_t *ptl;
int r;
@@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
return r;
}
- if (write_fault && !pte_write(*ptep)) {
+ pte = ptep_get(ptep);
+
+ if (write_fault && !pte_write(pte)) {
pfn = KVM_PFN_ERR_RO_FAULT;
goto out;
}
if (writable)
- *writable = pte_write(*ptep);
- pfn = pte_pfn(*ptep);
+ *writable = pte_write(pte);
+ pfn = pte_pfn(pte);
/*
* Get a reference here because callers of *hva_to_pfn* and
@@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
* tail pages of non-compound higher order allocations, which
* would then underflow the refcount when the caller does the
* required put_page. Don't allow those pages here.
- */
+ */
if (!kvm_try_get_pfn(pfn))
r = -EFAULT;
@@ -3870,7 +3873,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
static int vcpu_get_pid(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = data;
- *val = pid_nr(rcu_access_pointer(vcpu->pid));
+
+ rcu_read_lock();
+ *val = pid_nr(rcu_dereference(vcpu->pid));
+ rcu_read_unlock();
return 0;
}
@@ -3962,18 +3968,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
}
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
- r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
- BUG_ON(r == -EBUSY);
+ r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
if (r)
goto unlock_vcpu_destroy;
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
- if (r < 0) {
- xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
- kvm_put_kvm_no_destroy(kvm);
- goto unlock_vcpu_destroy;
+ if (r < 0)
+ goto kvm_put_xa_release;
+
+ if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
+ r = -EINVAL;
+ goto kvm_put_xa_release;
}
/*
@@ -3988,6 +3995,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
kvm_create_vcpu_debugfs(vcpu);
return r;
+kvm_put_xa_release:
+ kvm_put_kvm_no_destroy(kvm);
+ xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
kvm_dirty_ring_free(&vcpu->dirty_ring);
@@ -4025,8 +4035,17 @@ static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
sizeof(vcpu->stat), user_buffer, size, offset);
}
+static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
+{
+ struct kvm_vcpu *vcpu = file->private_data;
+
+ kvm_put_kvm(vcpu->kvm);
+ return 0;
+}
+
static const struct file_operations kvm_vcpu_stats_fops = {
.read = kvm_vcpu_stats_read,
+ .release = kvm_vcpu_stats_release,
.llseek = noop_llseek,
};
@@ -4047,6 +4066,9 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
put_unused_fd(fd);
return PTR_ERR(file);
}
+
+ kvm_get_kvm(vcpu->kvm);
+
file->f_mode |= FMODE_PREAD;
fd_install(fd, file);
@@ -4598,7 +4620,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
}
-static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+bool kvm_are_all_memslots_empty(struct kvm *kvm)
{
int i;
@@ -4611,6 +4633,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
return true;
}
+EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
struct kvm_enable_cap *cap)
@@ -4690,8 +4713,17 @@ static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
sizeof(kvm->stat), user_buffer, size, offset);
}
+static int kvm_vm_stats_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = file->private_data;
+
+ kvm_put_kvm(kvm);
+ return 0;
+}
+
static const struct file_operations kvm_vm_stats_fops = {
.read = kvm_vm_stats_read,
+ .release = kvm_vm_stats_release,
.llseek = noop_llseek,
};
@@ -4710,6 +4742,9 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
put_unused_fd(fd);
return PTR_ERR(file);
}
+
+ kvm_get_kvm(kvm);
+
file->f_mode |= FMODE_PREAD;
fd_install(fd, file);
@@ -5184,7 +5219,20 @@ static void hardware_disable_all(void)
static int hardware_enable_all(void)
{
atomic_t failed = ATOMIC_INIT(0);
- int r = 0;
+ int r;
+
+ /*
+ * Do not enable hardware virtualization if the system is going down.
+ * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+ * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
+ * after kvm_reboot() is called. Note, this relies on system_state
+ * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
+ * hook instead of registering a dedicated reboot notifier (the latter
+ * runs before system_state is updated).
+ */
+ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+ system_state == SYSTEM_RESTART)
+ return -EBUSY;
/*
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
@@ -5197,6 +5245,8 @@ static int hardware_enable_all(void)
cpus_read_lock();
mutex_lock(&kvm_lock);
+ r = 0;
+
kvm_usage_count++;
if (kvm_usage_count == 1) {
on_each_cpu(hardware_enable_nolock, &failed, 1);
@@ -5213,26 +5263,24 @@ static int hardware_enable_all(void)
return r;
}
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
+static void kvm_shutdown(void)
{
/*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- *
- * And Intel TXT required VMX off for all cpu when system shutdown.
+ * Disable hardware virtualization and set kvm_rebooting to indicate
+ * that KVM has asynchronously disabled hardware virtualization, i.e.
+ * that relevant errors and exceptions aren't entirely unexpected.
+ * Some flavors of hardware virtualization need to be disabled before
+ * transferring control to firmware (to perform shutdown/reboot), e.g.
+ * on x86, virtualization can block INIT interrupts, which are used by
+ * firmware to pull APs back under firmware control. Note, this path
+ * is used for both shutdown and reboot scenarios, i.e. neither name is
+ * 100% comprehensive.
*/
pr_info("kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
on_each_cpu(hardware_disable_nolock, NULL, 1);
- return NOTIFY_OK;
}
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
static int kvm_suspend(void)
{
/*
@@ -5263,6 +5311,7 @@ static void kvm_resume(void)
static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
+ .shutdown = kvm_shutdown,
};
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int hardware_enable_all(void)
@@ -5276,6 +5325,12 @@ static void hardware_disable_all(void)
}
#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->ops->destructor)
+ dev->ops->destructor(dev);
+}
+
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -5499,7 +5554,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i, j;
+ int i;
struct kvm_io_bus *new_bus, *bus;
lockdep_assert_held(&kvm->slots_lock);
@@ -5529,18 +5584,19 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
- /* Destroy the old bus _after_ installing the (null) bus. */
+ /*
+ * If NULL bus is installed, destroy the old bus, including all the
+ * attached devices. Otherwise, destroy the caller's device only.
+ */
if (!new_bus) {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- for (j = 0; j < bus->dev_count; j++) {
- if (j == i)
- continue;
- kvm_iodevice_destructor(bus->range[j].dev);
- }
+ kvm_io_bus_destroy(bus);
+ return -ENOMEM;
}
+ kvm_iodevice_destructor(dev);
kfree(bus);
- return new_bus ? 0 : -ENOMEM;
+ return 0;
}
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -5967,7 +6023,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
return r;
- register_reboot_notifier(&kvm_reboot_notifier);
register_syscore_ops(&kvm_syscore_ops);
#endif
@@ -6039,7 +6094,6 @@ err_cpu_kick_mask:
err_vcpu_cache:
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
- unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
return r;
@@ -6065,7 +6119,6 @@ void kvm_exit(void)
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
- unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();