| author | Dmitry Torokhov <[email protected]> | 2023-05-01 15:20:08 -0700 | 
|---|---|---|
| committer | Dmitry Torokhov <[email protected]> | 2023-05-01 15:20:08 -0700 | 
| commit | 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e (patch) | |
| tree | d57f3a63479a07b4e0cece029886e76e04feb984 /virt/kvm/kvm_main.c | |
| parent | 5dc63e56a9cf8df0b59c234a505a1653f1bdf885 (diff) | |
| parent | 53bea86b5712c7491bb3dae12e271666df0a308c (diff) | |
Merge branch 'next' into for-linus
Prepare input updates for 6.4 merge window.
Diffstat (limited to 'virt/kvm/kvm_main.c')
| -rw-r--r-- | virt/kvm/kvm_main.c | 304 | 
1 file changed, 163 insertions, 141 deletions
```diff
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..d255964ec331 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,8 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  */
 
 DEFINE_MUTEX(kvm_lock);
-static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
-static cpumask_var_t cpus_hardware_enabled;
-static int kvm_usage_count;
-static atomic_t hardware_enable_failed;
-
 static struct kmem_cache *kvm_vcpu_cache;
 
 static __read_mostly struct preempt_ops kvm_preempt_ops;
@@ -148,9 +143,6 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-__visible bool kvm_rebooting;
-EXPORT_SYMBOL_GPL(kvm_rebooting);
-
 #define KVM_EVENT_CREATE_VM 0
 #define KVM_EVENT_DESTROY_VM 1
 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
@@ -3954,6 +3946,13 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	}
 
 	mutex_lock(&kvm->lock);
+
+#ifdef CONFIG_LOCKDEP
+	/* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
+	mutex_lock(&vcpu->mutex);
+	mutex_unlock(&vcpu->mutex);
+#endif
+
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
@@ -5095,50 +5094,70 @@ static struct miscdevice kvm_dev = {
 	&kvm_chardev_ops,
 };
 
-static void hardware_enable_nolock(void *junk)
-{
-	int cpu = raw_smp_processor_id();
-	int r;
-
-	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
-		return;
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+__visible bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
 
-	cpumask_set_cpu(cpu, cpus_hardware_enabled);
+static DEFINE_PER_CPU(bool, hardware_enabled);
+static int kvm_usage_count;
 
-	r = kvm_arch_hardware_enable();
+static int __hardware_enable_nolock(void)
+{
+	if (__this_cpu_read(hardware_enabled))
+		return 0;
 
-	if (r) {
-		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-		atomic_inc(&hardware_enable_failed);
-		pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
+	if (kvm_arch_hardware_enable()) {
+		pr_info("kvm: enabling virtualization on CPU%d failed\n",
+			raw_smp_processor_id());
+		return -EIO;
 	}
+
+	__this_cpu_write(hardware_enabled, true);
+	return 0;
 }
 
-static int kvm_starting_cpu(unsigned int cpu)
+static void hardware_enable_nolock(void *failed)
 {
-	raw_spin_lock(&kvm_count_lock);
+	if (__hardware_enable_nolock())
+		atomic_inc(failed);
+}
+
+static int kvm_online_cpu(unsigned int cpu)
+{
+	int ret = 0;
+
+	/*
+	 * Abort the CPU online process if hardware virtualization cannot
+	 * be enabled. Otherwise running VMs would encounter unrecoverable
+	 * errors when scheduled to this CPU.
+	 */
+	mutex_lock(&kvm_lock);
 	if (kvm_usage_count)
-		hardware_enable_nolock(NULL);
-	raw_spin_unlock(&kvm_count_lock);
-	return 0;
+		ret = __hardware_enable_nolock();
+	mutex_unlock(&kvm_lock);
+	return ret;
 }
 
 static void hardware_disable_nolock(void *junk)
 {
-	int cpu = raw_smp_processor_id();
-
-	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
+	/*
+	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
+	 * hardware, not just CPUs that successfully enabled hardware!
+	 */
+	if (!__this_cpu_read(hardware_enabled))
 		return;
-	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+
 	kvm_arch_hardware_disable();
+
+	__this_cpu_write(hardware_enabled, false);
 }
 
-static int kvm_dying_cpu(unsigned int cpu)
+static int kvm_offline_cpu(unsigned int cpu)
 {
-	raw_spin_lock(&kvm_count_lock);
+	mutex_lock(&kvm_lock);
 	if (kvm_usage_count)
 		hardware_disable_nolock(NULL);
-	raw_spin_unlock(&kvm_count_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -5153,29 +5172,41 @@ static void hardware_disable_all_nolock(void)
 
 static void hardware_disable_all(void)
 {
-	raw_spin_lock(&kvm_count_lock);
+	cpus_read_lock();
+	mutex_lock(&kvm_lock);
 	hardware_disable_all_nolock();
-	raw_spin_unlock(&kvm_count_lock);
+	mutex_unlock(&kvm_lock);
+	cpus_read_unlock();
 }
 
 static int hardware_enable_all(void)
 {
+	atomic_t failed = ATOMIC_INIT(0);
 	int r = 0;
 
-	raw_spin_lock(&kvm_count_lock);
+	/*
+	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
+	 * is called, and so on_each_cpu() between them includes the CPU that
+	 * is being onlined.  As a result, hardware_enable_nolock() may get
+	 * invoked before kvm_online_cpu(), which also enables hardware if the
+	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
+	 * enable hardware multiple times.
+	 */
+	cpus_read_lock();
+	mutex_lock(&kvm_lock);
 
 	kvm_usage_count++;
 	if (kvm_usage_count == 1) {
-		atomic_set(&hardware_enable_failed, 0);
-		on_each_cpu(hardware_enable_nolock, NULL, 1);
+		on_each_cpu(hardware_enable_nolock, &failed, 1);
 
-		if (atomic_read(&hardware_enable_failed)) {
+		if (atomic_read(&failed)) {
 			hardware_disable_all_nolock();
 			r = -EBUSY;
 		}
 	}
 
-	raw_spin_unlock(&kvm_count_lock);
+	mutex_unlock(&kvm_lock);
+	cpus_read_unlock();
 
 	return r;
 }
@@ -5200,6 +5231,49 @@ static struct notifier_block kvm_reboot_notifier = {
 	.priority = 0,
 };
 
+static int kvm_suspend(void)
+{
+	/*
+	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
+	 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
+	 * is stable.  Assert that kvm_lock is not held to ensure the system
+	 * isn't suspended while KVM is enabling hardware.  Hardware enabling
+	 * can be preempted, but the task cannot be frozen until it has dropped
+	 * all locks (userspace tasks are frozen via a fake signal).
+	 */
+	lockdep_assert_not_held(&kvm_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (kvm_usage_count)
+		hardware_disable_nolock(NULL);
+	return 0;
+}
+
+static void kvm_resume(void)
+{
+	lockdep_assert_not_held(&kvm_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (kvm_usage_count)
+		WARN_ON_ONCE(__hardware_enable_nolock());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+	.suspend = kvm_suspend,
+	.resume = kvm_resume,
+};
+#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+static int hardware_enable_all(void)
+{
+	return 0;
+}
+
+static void hardware_disable_all(void)
+{
+
+}
+#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 {
 	int i;
@@ -5778,26 +5852,6 @@ static void kvm_init_debug(void)
 	}
 }
 
-static int kvm_suspend(void)
-{
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	return 0;
-}
-
-static void kvm_resume(void)
-{
-	if (kvm_usage_count) {
-		lockdep_assert_not_held(&kvm_count_lock);
-		hardware_enable_nolock(NULL);
-	}
-}
-
-static struct syscore_ops kvm_syscore_ops = {
-	.suspend = kvm_suspend,
-	.resume = kvm_resume,
-};
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
@@ -5902,62 +5956,20 @@ void kvm_unregister_perf_callbacks(void)
 }
 #endif
 
-struct kvm_cpu_compat_check {
-	void *opaque;
-	int *ret;
-};
-
-static void check_processor_compat(void *data)
+int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 {
-	struct kvm_cpu_compat_check *c = data;
-
-	*c->ret = kvm_arch_check_processor_compat(c->opaque);
-}
-
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
-		  struct module *module)
-{
-	struct kvm_cpu_compat_check c;
 	int r;
 	int cpu;
 
-	r = kvm_arch_init(opaque);
-	if (r)
-		goto out_fail;
-
-	/*
-	 * kvm_arch_init makes sure there's at most one caller
-	 * for architectures that support multiple implementations,
-	 * like intel and amd on x86.
-	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
-	 * conflicts in case kvm is already setup for another implementation.
-	 */
-	r = kvm_irqfd_init();
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+				      kvm_online_cpu, kvm_offline_cpu);
 	if (r)
-		goto out_irqfd;
-
-	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	r = kvm_arch_hardware_setup(opaque);
-	if (r < 0)
-		goto out_free_1;
-
-	c.ret = &r;
-	c.opaque = opaque;
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, check_processor_compat, &c, 1);
-		if (r < 0)
-			goto out_free_2;
-	}
+		return r;
 
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
-				      kvm_starting_cpu, kvm_dying_cpu);
-	if (r)
-		goto out_free_2;
 	register_reboot_notifier(&kvm_reboot_notifier);
+	register_syscore_ops(&kvm_syscore_ops);
+#endif
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
@@ -5971,59 +5983,65 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 					   NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
-		goto out_free_3;
+		goto err_vcpu_cache;
 	}
 
 	for_each_possible_cpu(cpu) {
 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
 					    GFP_KERNEL, cpu_to_node(cpu))) {
 			r = -ENOMEM;
-			goto out_free_4;
+			goto err_cpu_kick_mask;
 		}
 	}
 
+	r = kvm_irqfd_init();
+	if (r)
+		goto err_irqfd;
+
 	r = kvm_async_pf_init();
 	if (r)
-		goto out_free_4;
+		goto err_async_pf;
 
 	kvm_chardev_ops.owner = module;
 
-	r = misc_register(&kvm_dev);
-	if (r) {
-		pr_err("kvm: misc device register failed\n");
-		goto out_unreg;
-	}
-
-	register_syscore_ops(&kvm_syscore_ops);
-
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
 
 	kvm_init_debug();
 
 	r = kvm_vfio_ops_init();
-	WARN_ON(r);
+	if (WARN_ON_ONCE(r))
+		goto err_vfio;
+
+	/*
+	 * Registration _must_ be the very last thing done, as this exposes
+	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+	 */
+	r = misc_register(&kvm_dev);
+	if (r) {
+		pr_err("kvm: misc device register failed\n");
+		goto err_register;
+	}
 
 	return 0;
 
-out_unreg:
+err_register:
+	kvm_vfio_ops_exit();
+err_vfio:
 	kvm_async_pf_deinit();
-out_free_4:
+err_async_pf:
+	kvm_irqfd_exit();
+err_irqfd:
+err_cpu_kick_mask:
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
-out_free_3:
+err_vcpu_cache:
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+	unregister_syscore_ops(&kvm_syscore_ops);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
-out_free_2:
-	kvm_arch_hardware_unsetup();
-out_free_1:
-	free_cpumask_var(cpus_hardware_enabled);
-out_free_0:
-	kvm_irqfd_exit();
-out_irqfd:
-	kvm_arch_exit();
-out_fail:
+	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -6032,21 +6050,25 @@ void kvm_exit(void)
 {
 	int cpu;
 
-	debugfs_remove_recursive(kvm_debugfs_dir);
+	/*
+	 * Note, unregistering /dev/kvm doesn't strictly need to come first,
+	 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+	 * to KVM while the module is being stopped.
+	 */
 	misc_deregister(&kvm_dev);
+
+	debugfs_remove_recursive(kvm_debugfs_dir);
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
 	unregister_syscore_ops(&kvm_syscore_ops);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
-	kvm_arch_hardware_unsetup();
-	kvm_arch_exit();
+	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
 	kvm_irqfd_exit();
-	free_cpumask_var(cpus_hardware_enabled);
-	kvm_vfio_ops_exit();
}
 EXPORT_SYMBOL_GPL(kvm_exit);
```
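The heart of the rework above is the switch from a shared `cpus_hardware_enabled` cpumask guarded by a raw spinlock to a per-CPU flag paired with a cpuhp online/offline state. The same pattern can be sketched as a self-contained module; this is a minimal illustration under stated assumptions, not KVM code: the `demo_*` names are hypothetical, and it registers a dynamic `CPUHP_AP_ONLINE_DYN` state where KVM claims its dedicated `CPUHP_AP_KVM_ONLINE` slot and calls the real `kvm_arch_hardware_enable()`/`kvm_arch_hardware_disable()` hooks.

```c
/*
 * Minimal sketch of the per-CPU enable/disable pattern used in the diff
 * above.  The demo_* names are hypothetical; KVM itself registers the
 * fixed CPUHP_AP_KVM_ONLINE state and drives the arch hardware hooks.
 */
#include <linux/cpuhotplug.h>
#include <linux/module.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(bool, demo_enabled);
static enum cpuhp_state demo_state;

/* Runs on the CPU coming online; a non-zero return aborts the online. */
static int demo_online_cpu(unsigned int cpu)
{
	if (!__this_cpu_read(demo_enabled))
		__this_cpu_write(demo_enabled, true);
	return 0;
}

/* Runs on the CPU going offline; undoes demo_online_cpu(). */
static int demo_offline_cpu(unsigned int cpu)
{
	if (__this_cpu_read(demo_enabled))
		__this_cpu_write(demo_enabled, false);
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/*
	 * A dynamic AP state invokes the callback on every online CPU right
	 * away; KVM instead uses cpuhp_setup_state_nocalls() and enables
	 * hardware lazily, only while VMs exist (see hardware_enable_all()).
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo/cpu:online",
				demo_online_cpu, demo_offline_cpu);
	if (ret < 0)
		return ret;

	demo_state = ret;
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("Sketch of a per-CPU enable/disable cpuhp state");
MODULE_LICENSE("GPL");
```

Tracking enablement per CPU is what lets the merge drop `kvm_count_lock` in favor of the ordinary `kvm_lock` mutex: `kvm_online_cpu()` and `kvm_offline_cpu()` run in sleepable task context (unlike the old `CPUHP_AP_KVM_STARTING` callbacks), and `hardware_enable_all()` only needs `cpus_read_lock()` to keep the online mask stable while it broadcasts with `on_each_cpu()`.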