Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 165
1 file changed, 109 insertions(+), 56 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cb5c13eee193..5bbb5612b207 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -154,11 +154,6 @@ static unsigned long long kvm_active_vms;
 
 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-                                                   unsigned long start, unsigned long end)
-{
-}
-
 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }
@@ -521,18 +516,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
         return container_of(mn, struct kvm, mmu_notifier);
 }
 
-static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
-                                              struct mm_struct *mm,
-                                              unsigned long start, unsigned long end)
-{
-        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-        int idx;
-
-        idx = srcu_read_lock(&kvm->srcu);
-        kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
-        srcu_read_unlock(&kvm->srcu, idx);
-}
-
 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 
 typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
@@ -686,6 +669,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
         return __kvm_handle_hva_range(kvm, &range);
 }
 
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+        /*
+         * Skipping invalid memslots is correct if and only if change_pte() is
+         * surrounded by invalidate_range_{start,end}(), which is currently
+         * guaranteed by the primary MMU.  If that ever changes, KVM needs to
+         * unmap the memslot instead of skipping the memslot to ensure that KVM
+         * doesn't hold references to the old PFN.
+         */
+        WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+        if (range->slot->flags & KVM_MEMSLOT_INVALID)
+                return false;
+
+        return kvm_set_spte_gfn(kvm, range);
+}
+
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
                                         struct mm_struct *mm,
                                         unsigned long address,
@@ -707,7 +708,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
         if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
                 return;
 
-        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
 }
 
 void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
@@ -892,7 +893,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-        .invalidate_range       = kvm_mmu_notifier_invalidate_range,
         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
@@ -2477,7 +2477,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 {
         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
 
-        rc = get_user_pages(addr, 1, flags, NULL, NULL);
+        rc = get_user_pages(addr, 1, flags, NULL);
         return rc == -EHWPOISON;
 }
 
@@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 {
         kvm_pfn_t pfn;
         pte_t *ptep;
+        pte_t pte;
         spinlock_t *ptl;
         int r;
 
@@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                 return r;
         }
 
-        if (write_fault && !pte_write(*ptep)) {
+        pte = ptep_get(ptep);
+
+        if (write_fault && !pte_write(pte)) {
                 pfn = KVM_PFN_ERR_RO_FAULT;
                 goto out;
         }
 
         if (writable)
-                *writable = pte_write(*ptep);
-        pfn = pte_pfn(*ptep);
+                *writable = pte_write(pte);
+        pfn = pte_pfn(pte);
 
         /*
          * Get a reference here because callers of *hva_to_pfn* and
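
The hva_to_pfn_remapped() hunk above replaces repeated *ptep dereferences with a single ptep_get() read. A minimal sketch of the read-once pattern, assuming a hypothetical helper (example_pte_to_pfn is illustration only, not part of this diff):

    /*
     * Dereferencing *ptep more than once can observe different values if the
     * primary MMU updates the PTE concurrently, so the pte_write() check and
     * the pte_pfn() extraction could act on different PTEs.  A single
     * ptep_get() read gives every decision the same snapshot, and lets
     * architectures (e.g. arm64 with contiguous PTEs) hook the access.
     */
    static kvm_pfn_t example_pte_to_pfn(pte_t *ptep, bool write_fault)
    {
            pte_t pte = ptep_get(ptep);     /* one consistent read */

            if (write_fault && !pte_write(pte))
                    return KVM_PFN_ERR_RO_FAULT;    /* checked on the snapshot */

            return pte_pfn(pte);            /* same snapshot as the check */
    }
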
@@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
          * tail pages of non-compound higher order allocations, which
          * would then underflow the refcount when the caller does the
          * required put_page. Don't allow those pages here.
-         */ 
+         */
         if (!kvm_try_get_pfn(pfn))
                 r = -EFAULT;
 
@@ -3870,7 +3873,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 static int vcpu_get_pid(void *data, u64 *val)
 {
         struct kvm_vcpu *vcpu = data;
-        *val = pid_nr(rcu_access_pointer(vcpu->pid));
+
+        rcu_read_lock();
+        *val = pid_nr(rcu_dereference(vcpu->pid));
+        rcu_read_unlock();
         return 0;
 }
 
@@ -3962,18 +3968,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
         }
 
         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
-        r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
-        BUG_ON(r == -EBUSY);
+        r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
         if (r)
                 goto unlock_vcpu_destroy;
 
         /* Now it's all set up, let userspace reach it */
         kvm_get_kvm(kvm);
         r = create_vcpu_fd(vcpu);
-        if (r < 0) {
-                xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
-                kvm_put_kvm_no_destroy(kvm);
-                goto unlock_vcpu_destroy;
+        if (r < 0)
+                goto kvm_put_xa_release;
+
+        if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
+                r = -EINVAL;
+                goto kvm_put_xa_release;
         }
 
         /*
@@ -3988,6 +3995,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
         kvm_create_vcpu_debugfs(vcpu);
         return r;
 
+kvm_put_xa_release:
+        kvm_put_kvm_no_destroy(kvm);
+        xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
         mutex_unlock(&kvm->lock);
         kvm_dirty_ring_free(&vcpu->dirty_ring);
@@ -4025,8 +4035,17 @@ static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
                         sizeof(vcpu->stat), user_buffer, size, offset);
 }
 
+static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
+{
+        struct kvm_vcpu *vcpu = file->private_data;
+
+        kvm_put_kvm(vcpu->kvm);
+        return 0;
+}
+
 static const struct file_operations kvm_vcpu_stats_fops = {
         .read = kvm_vcpu_stats_read,
+        .release = kvm_vcpu_stats_release,
         .llseek = noop_llseek,
 };
 
@@ -4047,6 +4066,9 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
                 put_unused_fd(fd);
                 return PTR_ERR(file);
         }
+
+        kvm_get_kvm(vcpu->kvm);
+
         file->f_mode |= FMODE_PREAD;
         fd_install(fd, file);
 
@@ -4598,7 +4620,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
         return -EINVAL;
 }
 
-static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+bool kvm_are_all_memslots_empty(struct kvm *kvm)
 {
         int i;
 
@@ -4611,6 +4633,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
 
         return true;
 }
+EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
 
 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                            struct kvm_enable_cap *cap)
@@ -4690,8 +4713,17 @@ static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
                         sizeof(kvm->stat), user_buffer, size, offset);
 }
 
+static int kvm_vm_stats_release(struct inode *inode, struct file *file)
+{
+        struct kvm *kvm = file->private_data;
+
+        kvm_put_kvm(kvm);
+        return 0;
+}
+
 static const struct file_operations kvm_vm_stats_fops = {
         .read = kvm_vm_stats_read,
+        .release = kvm_vm_stats_release,
         .llseek = noop_llseek,
 };
 
@@ -4710,6 +4742,9 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
                 put_unused_fd(fd);
                 return PTR_ERR(file);
         }
+
+        kvm_get_kvm(kvm);
+
         file->f_mode |= FMODE_PREAD;
         fd_install(fd, file);
 
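
The stats hunks above pair a kvm_get_kvm() taken just before fd_install() with a kvm_put_kvm() in the new ->release() hook, because the returned file descriptor can outlive the ioctl (and even the VM fd) that created it. A sketch of that lifetime rule with a hypothetical refcounted object (the demo_* names are illustrative, not from this diff):

    #include <linux/anon_inodes.h>
    #include <linux/file.h>
    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct demo_obj {
            refcount_t ref;
    };

    static int demo_release(struct inode *inode, struct file *file)
    {
            struct demo_obj *obj = file->private_data;

            if (refcount_dec_and_test(&obj->ref))  /* pairs with the get below */
                    kfree(obj);
            return 0;
    }

    static const struct file_operations demo_fops = {
            .release = demo_release,
    };

    static int demo_get_fd(struct demo_obj *obj)
    {
            int fd = get_unused_fd_flags(O_CLOEXEC);
            struct file *file;

            if (fd < 0)
                    return fd;

            file = anon_inode_getfile("demo", &demo_fops, obj, O_RDONLY);
            if (IS_ERR(file)) {
                    put_unused_fd(fd);
                    return PTR_ERR(file);
            }

            /*
             * Take the reference before fd_install(); once the fd is live,
             * userspace can close it and trigger demo_release() immediately.
             */
            refcount_inc(&obj->ref);
            fd_install(fd, file);
            return fd;
    }
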
@@ -5184,7 +5219,20 @@ static void hardware_disable_all(void)
 static int hardware_enable_all(void)
 {
         atomic_t failed = ATOMIC_INIT(0);
-        int r = 0;
+        int r;
+
+        /*
+         * Do not enable hardware virtualization if the system is going down.
+         * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+         * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
+         * after kvm_reboot() is called.  Note, this relies on system_state
+         * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
+         * hook instead of registering a dedicated reboot notifier (the latter
+         * runs before system_state is updated).
+         */
+        if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+            system_state == SYSTEM_RESTART)
+                return -EBUSY;
 
         /*
          * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
@@ -5197,6 +5245,8 @@ static int hardware_enable_all(void)
         cpus_read_lock();
         mutex_lock(&kvm_lock);
 
+        r = 0;
+
         kvm_usage_count++;
         if (kvm_usage_count == 1) {
                 on_each_cpu(hardware_enable_nolock, &failed, 1);
@@ -5213,26 +5263,24 @@ static int hardware_enable_all(void)
         return r;
 }
 
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
-                      void *v)
+static void kvm_shutdown(void)
 {
         /*
-         * Some (well, at least mine) BIOSes hang on reboot if
-         * in vmx root mode.
-         *
-         * And Intel TXT required VMX off for all cpu when system shutdown.
+         * Disable hardware virtualization and set kvm_rebooting to indicate
+         * that KVM has asynchronously disabled hardware virtualization, i.e.
+         * that relevant errors and exceptions aren't entirely unexpected.
+         * Some flavors of hardware virtualization need to be disabled before
+         * transferring control to firmware (to perform shutdown/reboot), e.g.
+         * on x86, virtualization can block INIT interrupts, which are used by
+         * firmware to pull APs back under firmware control.  Note, this path
+         * is used for both shutdown and reboot scenarios, i.e. neither name is
+         * 100% comprehensive.
          */
         pr_info("kvm: exiting hardware virtualization\n");
         kvm_rebooting = true;
         on_each_cpu(hardware_disable_nolock, NULL, 1);
-        return NOTIFY_OK;
 }
 
-static struct notifier_block kvm_reboot_notifier = {
-        .notifier_call = kvm_reboot,
-        .priority = 0,
-};
-
 static int kvm_suspend(void)
 {
         /*
@@ -5263,6 +5311,7 @@ static void kvm_resume(void)
 static struct syscore_ops kvm_syscore_ops = {
         .suspend = kvm_suspend,
         .resume = kvm_resume,
+        .shutdown = kvm_shutdown,
 };
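
The move from a reboot notifier to syscore ops above works because a syscore ->shutdown hook runs after the kernel has set system_state, which is what lets hardware_enable_all() reliably reject late VM creation with -EBUSY. A sketch of the hook shape (the demo_* names are illustrative, not from this diff):

    #include <linux/syscore_ops.h>

    static void demo_shutdown(void)
    {
            /*
             * Runs with system_state already set to SYSTEM_RESTART, HALT, or
             * POWER_OFF, so any creation path that checks system_state first
             * cannot slip new work in behind this hook.
             */
    }

    static struct syscore_ops demo_syscore_ops = {
            .shutdown = demo_shutdown,
    };

    static int __init demo_init(void)
    {
            register_syscore_ops(&demo_syscore_ops);
            return 0;
    }
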
 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
 static int hardware_enable_all(void)
@@ -5276,6 +5325,12 @@ static void hardware_disable_all(void)
 }
 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
 
+static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+        if (dev->ops->destructor)
+                dev->ops->destructor(dev);
+}
+
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 {
         int i;
@@ -5499,7 +5554,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                               struct kvm_io_device *dev)
 {
-        int i, j;
+        int i;
         struct kvm_io_bus *new_bus, *bus;
 
         lockdep_assert_held(&kvm->slots_lock);
@@ -5529,18 +5584,19 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
         synchronize_srcu_expedited(&kvm->srcu);
 
-        /* Destroy the old bus _after_ installing the (null) bus. */
+        /*
+         * If NULL bus is installed, destroy the old bus, including all the
+         * attached devices. Otherwise, destroy the caller's device only.
+         */
         if (!new_bus) {
                 pr_err("kvm: failed to shrink bus, removing it completely\n");
-                for (j = 0; j < bus->dev_count; j++) {
-                        if (j == i)
-                                continue;
-                        kvm_iodevice_destructor(bus->range[j].dev);
-                }
+                kvm_io_bus_destroy(bus);
+                return -ENOMEM;
         }
 
+        kvm_iodevice_destructor(dev);
         kfree(bus);
-        return new_bus ? 0 : -ENOMEM;
+        return 0;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -5967,7 +6023,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
         if (r)
                 return r;
 
-        register_reboot_notifier(&kvm_reboot_notifier);
         register_syscore_ops(&kvm_syscore_ops);
 #endif
 
@@ -6039,7 +6094,6 @@ err_cpu_kick_mask:
 err_vcpu_cache:
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
         unregister_syscore_ops(&kvm_syscore_ops);
-        unregister_reboot_notifier(&kvm_reboot_notifier);
         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
 #endif
         return r;
@@ -6065,7 +6119,6 @@ void kvm_exit(void)
         kvm_async_pf_deinit();
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
         unregister_syscore_ops(&kvm_syscore_ops);
-        unregister_reboot_notifier(&kvm_reboot_notifier);
         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
 #endif
         kvm_irqfd_exit();
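
With the reworked kvm_io_bus_unregister_dev() above, a device whose kvm_io_device_ops provides ->destructor is torn down by the bus code itself: the caller's device on success, or every attached device via kvm_io_bus_destroy() when shrinking the bus fails, so callers should not free the device again afterwards. A sketch of a destructor-backed device (the demo_* names are illustrative, not from this diff):

    #include <linux/slab.h>

    struct demo_iodev {
            struct kvm_io_device dev;
    };

    static void demo_destructor(struct kvm_io_device *dev)
    {
            struct demo_iodev *d = container_of(dev, struct demo_iodev, dev);

            /* Invoked via kvm_iodevice_destructor() during unregistration. */
            kfree(d);
    }

    static const struct kvm_io_device_ops demo_ops = {
            .destructor = demo_destructor,
    };
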