Diffstat (limited to 'arch/x86/kvm/x86.c')
 arch/x86/kvm/x86.c | 691
 1 file changed, 491 insertions(+), 200 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0c47b41c264..5f5eb577d583 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -173,8 +173,13 @@ bool __read_mostly enable_vmware_backdoor = false;  module_param(enable_vmware_backdoor, bool, S_IRUGO);  EXPORT_SYMBOL_GPL(enable_vmware_backdoor); -static bool __read_mostly force_emulation_prefix = false; -module_param(force_emulation_prefix, bool, S_IRUGO); +/* + * Flags to manipulate forced emulation behavior (any non-zero value will + * enable forced emulation). + */ +#define KVM_FEP_CLEAR_RFLAGS_RF	BIT(1) +static int __read_mostly force_emulation_prefix; +module_param(force_emulation_prefix, int, 0644);  int __read_mostly pi_inject_timer = -1;  module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); @@ -528,6 +533,7 @@ static int exception_class(int vector)  #define EXCPT_TRAP		1  #define EXCPT_ABORT		2  #define EXCPT_INTERRUPT		3 +#define EXCPT_DB		4  static int exception_type(int vector)  { @@ -538,8 +544,14 @@ static int exception_type(int vector)  	mask = 1 << vector; -	/* #DB is trap, as instruction watchpoints are handled elsewhere */ -	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) +	/* +	 * #DBs can be trap-like or fault-like, the caller must check other CPU +	 * state, e.g. DR6, to determine whether a #DB is a trap or fault. +	 */ +	if (mask & (1 << DB_VECTOR)) +		return EXCPT_DB; + +	if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))  		return EXCPT_TRAP;  	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) @@ -549,16 +561,13 @@ static int exception_type(int vector)  	return EXCPT_FAULT;  } -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, +				   struct kvm_queued_exception *ex)  { -	unsigned nr = vcpu->arch.exception.nr; -	bool has_payload = vcpu->arch.exception.has_payload; -	unsigned long payload = vcpu->arch.exception.payload; - -	if (!has_payload) +	if (!ex->has_payload)  		return; -	switch (nr) { +	switch (ex->vector) {  	case DB_VECTOR:  		/*  		 * "Certain debug exceptions may clear bit 0-3.  The @@ -583,8 +592,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)  		 * So they need to be flipped for DR6.  		 
*/  		vcpu->arch.dr6 |= DR6_ACTIVE_LOW; -		vcpu->arch.dr6 |= payload; -		vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; +		vcpu->arch.dr6 |= ex->payload; +		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;  		/*  		 * The #DB payload is defined as compatible with the 'pending @@ -595,15 +604,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)  		vcpu->arch.dr6 &= ~BIT(12);  		break;  	case PF_VECTOR: -		vcpu->arch.cr2 = payload; +		vcpu->arch.cr2 = ex->payload;  		break;  	} -	vcpu->arch.exception.has_payload = false; -	vcpu->arch.exception.payload = 0; +	ex->has_payload = false; +	ex->payload = 0;  }  EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, +				       bool has_error_code, u32 error_code, +				       bool has_payload, unsigned long payload) +{ +	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + +	ex->vector = vector; +	ex->injected = false; +	ex->pending = true; +	ex->has_error_code = has_error_code; +	ex->error_code = error_code; +	ex->has_payload = has_payload; +	ex->payload = payload; +} +  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,  		unsigned nr, bool has_error, u32 error_code,  	        bool has_payload, unsigned long payload, bool reinject) @@ -613,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,  	kvm_make_request(KVM_REQ_EVENT, vcpu); +	/* +	 * If the exception is destined for L2 and isn't being reinjected, +	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A +	 * previously injected exception is not checked because it was checked +	 * when it was original queued, and re-checking is incorrect if _L1_ +	 * injected the exception, in which case it's exempt from interception. +	 */ +	if (!reinject && is_guest_mode(vcpu) && +	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { +		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, +					   has_payload, payload); +		return; +	} +  	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {  	queue:  		if (reinject) {  			/* -			 * On vmentry, vcpu->arch.exception.pending is only -			 * true if an event injection was blocked by -			 * nested_run_pending.  In that case, however, -			 * vcpu_enter_guest requests an immediate exit, -			 * and the guest shouldn't proceed far enough to -			 * need reinjection. +			 * On VM-Entry, an exception can be pending if and only +			 * if event injection was blocked by nested_run_pending. +			 * In that case, however, vcpu_enter_guest() requests an +			 * immediate exit, and the guest shouldn't proceed far +			 * enough to need reinjection.  			 
*/ -			WARN_ON_ONCE(vcpu->arch.exception.pending); +			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));  			vcpu->arch.exception.injected = true;  			if (WARN_ON_ONCE(has_payload)) {  				/* @@ -639,17 +676,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,  			vcpu->arch.exception.injected = false;  		}  		vcpu->arch.exception.has_error_code = has_error; -		vcpu->arch.exception.nr = nr; +		vcpu->arch.exception.vector = nr;  		vcpu->arch.exception.error_code = error_code;  		vcpu->arch.exception.has_payload = has_payload;  		vcpu->arch.exception.payload = payload;  		if (!is_guest_mode(vcpu)) -			kvm_deliver_exception_payload(vcpu); +			kvm_deliver_exception_payload(vcpu, +						      &vcpu->arch.exception);  		return;  	}  	/* to check exception */ -	prev_nr = vcpu->arch.exception.nr; +	prev_nr = vcpu->arch.exception.vector;  	if (prev_nr == DF_VECTOR) {  		/* triple fault -> shutdown */  		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -657,25 +695,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,  	}  	class1 = exception_class(prev_nr);  	class2 = exception_class(nr); -	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) -		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { +	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || +	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {  		/* -		 * Generate double fault per SDM Table 5-5.  Set -		 * exception.pending = true so that the double fault -		 * can trigger a nested vmexit. +		 * Synthesize #DF.  Clear the previously injected or pending +		 * exception so as not to incorrectly trigger shutdown.  		 */ -		vcpu->arch.exception.pending = true;  		vcpu->arch.exception.injected = false; -		vcpu->arch.exception.has_error_code = true; -		vcpu->arch.exception.nr = DF_VECTOR; -		vcpu->arch.exception.error_code = 0; -		vcpu->arch.exception.has_payload = false; -		vcpu->arch.exception.payload = 0; -	} else +		vcpu->arch.exception.pending = false; + +		kvm_queue_exception_e(vcpu, DF_VECTOR, 0); +	} else {  		/* replace previous exception with a new one in a hope  		   that instruction re-execution will regenerate lost  		   exception */  		goto queue; +	}  }  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) @@ -729,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)  {  	++vcpu->stat.pf_guest; -	vcpu->arch.exception.nested_apf = -		is_guest_mode(vcpu) && fault->async_page_fault; -	if (vcpu->arch.exception.nested_apf) { -		vcpu->arch.apf.nested_apf_token = fault->address; -		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); -	} else { + +	/* +	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of +	 * whether or not L1 wants to intercept "regular" #PF. +	 */ +	if (is_guest_mode(vcpu) && fault->async_page_fault) +		kvm_queue_exception_vmexit(vcpu, PF_VECTOR, +					   true, fault->error_code, +					   true, fault->address); +	else  		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,  					fault->address); -	}  }  EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -/* Returns true if the page fault was immediately morphed into a VM-Exit. 
*/ -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,  				    struct x86_exception *fault)  {  	struct kvm_mmu *fault_mmu; @@ -760,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,  		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,  				       fault_mmu->root.hpa); -	/* -	 * A workaround for KVM's bad exception handling.  If KVM injected an -	 * exception into L2, and L2 encountered a #PF while vectoring the -	 * injected exception, manually check to see if L1 wants to intercept -	 * #PF, otherwise queuing the #PF will lead to #DF or a lost exception. -	 * In all other cases, defer the check to nested_ops->check_events(), -	 * which will correctly handle priority (this does not).  Note, other -	 * exceptions, e.g. #GP, are theoretically affected, #PF is simply the -	 * most problematic, e.g. when L0 and L1 are both intercepting #PF for -	 * shadow paging. -	 * -	 * TODO: Rewrite exception handling to track injected and pending -	 *       (VM-Exit) exceptions separately. -	 */ -	if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) && -	    kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault)) -		return true; -  	fault_mmu->inject_page_fault(vcpu, fault); -	return false;  }  EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); @@ -2297,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,  	/* we verify if the enable bit is set... */  	if (system_time & 1) { -		kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, -					  KVM_HOST_USES_PFN, system_time & ~1ULL, -					  sizeof(struct pvclock_vcpu_time_info)); +		kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, +				 KVM_HOST_USES_PFN, system_time & ~1ULL, +				 sizeof(struct pvclock_vcpu_time_info));  	} else { -		kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); +		kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);  	}  	return; @@ -3370,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)  static void kvmclock_reset(struct kvm_vcpu *vcpu)  { -	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); +	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);  	vcpu->arch.time = 0;  } @@ -4841,7 +4859,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)  	return (kvm_arch_interrupt_allowed(vcpu) &&  		kvm_cpu_accept_dm_intr(vcpu) &&  		!kvm_event_needs_reinjection(vcpu) && -		!vcpu->arch.exception.pending); +		!kvm_is_exception_pending(vcpu));  }  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -5016,25 +5034,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,  					       struct kvm_vcpu_events *events)  { +	struct kvm_queued_exception *ex; +  	process_nmi(vcpu);  	if (kvm_check_request(KVM_REQ_SMI, vcpu))  		process_smi(vcpu);  	/* -	 * In guest mode, payload delivery should be deferred, -	 * so that the L1 hypervisor can intercept #PF before -	 * CR2 is modified (or intercept #DB before DR6 is -	 * modified under nVMX). Unless the per-VM capability, -	 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of -	 * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we -	 * opportunistically defer the exception payload, deliver it if the -	 * capability hasn't been requested before processing a -	 * KVM_GET_VCPU_EVENTS. 
+	 * KVM's ABI only allows for one exception to be migrated.  Luckily, +	 * the only time there can be two queued exceptions is if there's a +	 * non-exiting _injected_ exception, and a pending exiting exception. +	 * In that case, ignore the VM-Exiting exception as it's an extension +	 * of the injected exception. +	 */ +	if (vcpu->arch.exception_vmexit.pending && +	    !vcpu->arch.exception.pending && +	    !vcpu->arch.exception.injected) +		ex = &vcpu->arch.exception_vmexit; +	else +		ex = &vcpu->arch.exception; + +	/* +	 * In guest mode, payload delivery should be deferred if the exception +	 * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 +	 * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability, +	 * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not +	 * propagate the payload and so it cannot be safely deferred.  Deliver +	 * the payload if the capability hasn't been requested.  	 */  	if (!vcpu->kvm->arch.exception_payload_enabled && -	    vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) -		kvm_deliver_exception_payload(vcpu); +	    ex->pending && ex->has_payload) +		kvm_deliver_exception_payload(vcpu, ex);  	/*  	 * The API doesn't provide the instruction length for software @@ -5042,26 +5073,25 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,  	 * isn't advanced, we should expect to encounter the exception  	 * again.  	 */ -	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { +	if (kvm_exception_is_soft(ex->vector)) {  		events->exception.injected = 0;  		events->exception.pending = 0;  	} else { -		events->exception.injected = vcpu->arch.exception.injected; -		events->exception.pending = vcpu->arch.exception.pending; +		events->exception.injected = ex->injected; +		events->exception.pending = ex->pending;  		/*  		 * For ABI compatibility, deliberately conflate  		 * pending and injected exceptions when  		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.  		 */  		if (!vcpu->kvm->arch.exception_payload_enabled) -			events->exception.injected |= -				vcpu->arch.exception.pending; +			events->exception.injected |= ex->pending;  	} -	events->exception.nr = vcpu->arch.exception.nr; -	events->exception.has_error_code = vcpu->arch.exception.has_error_code; -	events->exception.error_code = vcpu->arch.exception.error_code; -	events->exception_has_payload = vcpu->arch.exception.has_payload; -	events->exception_payload = vcpu->arch.exception.payload; +	events->exception.nr = ex->vector; +	events->exception.has_error_code = ex->has_error_code; +	events->exception.error_code = ex->error_code; +	events->exception_has_payload = ex->has_payload; +	events->exception_payload = ex->payload;  	events->interrupt.injected =  		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; @@ -5131,9 +5161,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,  		return -EINVAL;  	process_nmi(vcpu); + +	/* +	 * Flag that userspace is stuffing an exception, the next KVM_RUN will +	 * morph the exception to a VM-Exit if appropriate.  Do this only for +	 * pending exceptions, already-injected exceptions are not subject to +	 * intercpetion.  Note, userspace that conflates pending and injected +	 * is hosed, and will incorrectly convert an injected exception into a +	 * pending exception, which in turn may cause a spurious VM-Exit. 
+	 */ +	vcpu->arch.exception_from_userspace = events->exception.pending; + +	vcpu->arch.exception_vmexit.pending = false; +  	vcpu->arch.exception.injected = events->exception.injected;  	vcpu->arch.exception.pending = events->exception.pending; -	vcpu->arch.exception.nr = events->exception.nr; +	vcpu->arch.exception.vector = events->exception.nr;  	vcpu->arch.exception.has_error_code = events->exception.has_error_code;  	vcpu->arch.exception.error_code = events->exception.error_code;  	vcpu->arch.exception.has_payload = events->exception_has_payload; @@ -6399,26 +6442,22 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,  	return 0;  } -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, +				       struct kvm_msr_filter *filter)  { -	struct kvm_msr_filter __user *user_msr_filter = argp;  	struct kvm_x86_msr_filter *new_filter, *old_filter; -	struct kvm_msr_filter filter;  	bool default_allow;  	bool empty = true;  	int r = 0;  	u32 i; -	if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) -		return -EFAULT; - -	if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) +	if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)  		return -EINVAL; -	for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) -		empty &= !filter.ranges[i].nmsrs; +	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) +		empty &= !filter->ranges[i].nmsrs; -	default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); +	default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);  	if (empty && !default_allow)  		return -EINVAL; @@ -6426,8 +6465,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)  	if (!new_filter)  		return -ENOMEM; -	for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { -		r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); +	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { +		r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);  		if (r) {  			kvm_free_msr_filter(new_filter);  			return r; @@ -6450,6 +6489,62 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)  	return 0;  } +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { +	__u32 flags; +	__u32 nmsrs; +	__u32 base; +	__u32 bitmap; +}; + +struct kvm_msr_filter_compat { +	__u32 flags; +	struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, +			      unsigned long arg) +{ +	void __user *argp = (void __user *)arg; +	struct kvm *kvm = filp->private_data; +	long r = -ENOTTY; + +	switch (ioctl) { +	case KVM_X86_SET_MSR_FILTER_COMPAT: { +		struct kvm_msr_filter __user *user_msr_filter = argp; +		struct kvm_msr_filter_compat filter_compat; +		struct kvm_msr_filter filter; +		int i; + +		if (copy_from_user(&filter_compat, user_msr_filter, +				   sizeof(filter_compat))) +			return -EFAULT; + +		filter.flags = filter_compat.flags; +		for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { +			struct kvm_msr_filter_range_compat *cr; + +			cr = &filter_compat.ranges[i]; +			filter.ranges[i] = (struct kvm_msr_filter_range) { +				.flags = cr->flags, +				.nmsrs = cr->nmsrs, +				.base = cr->base, +				.bitmap = (__u8 *)(ulong)cr->bitmap, +			}; +		} + +		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); +		break; +	} +	} + +	return r; +} +#endif +  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER  static int 
kvm_arch_suspend_notifier(struct kvm *kvm)  { @@ -6872,9 +6967,16 @@ set_pit2_out:  	case KVM_SET_PMU_EVENT_FILTER:  		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);  		break; -	case KVM_X86_SET_MSR_FILTER: -		r = kvm_vm_ioctl_set_msr_filter(kvm, argp); +	case KVM_X86_SET_MSR_FILTER: { +		struct kvm_msr_filter __user *user_msr_filter = argp; +		struct kvm_msr_filter filter; + +		if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) +			return -EFAULT; + +		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);  		break; +	}  	default:  		r = -ENOTTY;  	} @@ -7257,6 +7359,7 @@ static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,  int handle_ud(struct kvm_vcpu *vcpu)  {  	static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; +	int fep_flags = READ_ONCE(force_emulation_prefix);  	int emul_type = EMULTYPE_TRAP_UD;  	char sig[5]; /* ud2; .ascii "kvm" */  	struct x86_exception e; @@ -7264,10 +7367,12 @@ int handle_ud(struct kvm_vcpu *vcpu)  	if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))  		return 1; -	if (force_emulation_prefix && +	if (fep_flags &&  	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),  				sig, sizeof(sig), &e) == 0 &&  	    memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { +		if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF) +			kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);  		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));  		emul_type = EMULTYPE_TRAP_UD_FORCED;  	} @@ -7933,14 +8038,20 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,  	int r;  	r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); +	if (r < 0) +		return X86EMUL_UNHANDLEABLE; -	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, -				    complete_emulated_rdmsr, r)) { -		/* Bounce to user space */ -		return X86EMUL_IO_NEEDED; +	if (r) { +		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, +				       complete_emulated_rdmsr, r)) +			return X86EMUL_IO_NEEDED; + +		trace_kvm_msr_read_ex(msr_index); +		return X86EMUL_PROPAGATE_FAULT;  	} -	return r; +	trace_kvm_msr_read(msr_index, *pdata); +	return X86EMUL_CONTINUE;  }  static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, @@ -7950,14 +8061,20 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,  	int r;  	r = kvm_set_msr_with_filter(vcpu, msr_index, data); +	if (r < 0) +		return X86EMUL_UNHANDLEABLE; -	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, -				    complete_emulated_msr_access, r)) { -		/* Bounce to user space */ -		return X86EMUL_IO_NEEDED; +	if (r) { +		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, +				       complete_emulated_msr_access, r)) +			return X86EMUL_IO_NEEDED; + +		trace_kvm_msr_write_ex(msr_index, data); +		return X86EMUL_PROPAGATE_FAULT;  	} -	return r; +	trace_kvm_msr_write(msr_index, data); +	return X86EMUL_CONTINUE;  }  static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, @@ -8161,18 +8278,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)  	}  } -static bool inject_emulated_exception(struct kvm_vcpu *vcpu) +static void inject_emulated_exception(struct kvm_vcpu *vcpu)  {  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; -	if (ctxt->exception.vector == PF_VECTOR) -		return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); -	if (ctxt->exception.error_code_valid) +	if (ctxt->exception.vector == PF_VECTOR) +		kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); +	else if 
(ctxt->exception.error_code_valid)  		kvm_queue_exception_e(vcpu, ctxt->exception.vector,  				      ctxt->exception.error_code);  	else  		kvm_queue_exception(vcpu, ctxt->exception.vector); -	return false;  }  static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -8548,8 +8664,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)  { +	u32 shadow; + +	if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) +		return true; + +	/* +	 * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, +	 * but AMD CPUs do not.  MOV/POP SS blocking is rare, check that first +	 * to avoid the relatively expensive CPUID lookup. +	 */ +	shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); +	return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && +	       guest_cpuid_is_intel(vcpu); +} + +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, +					   int emulation_type, int *r) +{ +	WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); + +	/* +	 * Do not check for code breakpoints if hardware has already done the +	 * checks, as inferred from the emulation type.  On NO_DECODE and SKIP, +	 * the instruction has passed all exception checks, and all intercepted +	 * exceptions that trigger emulation have lower priority than code +	 * breakpoints, i.e. the fact that the intercepted exception occurred +	 * means any code breakpoints have already been serviced. +	 * +	 * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as +	 * hardware has checked the RIP of the magic prefix, but not the RIP of +	 * the instruction being emulated.  The intent of forced emulation is +	 * to behave as if KVM intercepted the instruction without an exception +	 * and without a prefix. +	 */ +	if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP | +			      EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) +		return false; +  	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&  	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {  		struct kvm_run *kvm_run = vcpu->run; @@ -8569,7 +8723,7 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)  	}  	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && -	    !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { +	    !kvm_is_code_breakpoint_inhibited(vcpu)) {  		unsigned long eip = kvm_get_linear_rip(vcpu);  		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,  					   vcpu->arch.dr7, @@ -8671,8 +8825,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,  		 * are fault-like and are higher priority than any faults on  		 * the code fetch itself.  		 */ -		if (!(emulation_type & EMULTYPE_SKIP) && -		    kvm_vcpu_check_code_breakpoint(vcpu, &r)) +		if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))  			return r;  		r = x86_decode_emulated_instruction(vcpu, emulation_type, @@ -8770,8 +8923,7 @@ restart:  	if (ctxt->have_exception) {  		r = 1; -		if (inject_emulated_exception(vcpu)) -			return r; +		inject_emulated_exception(vcpu);  	} else if (vcpu->arch.pio.count) {  		if (!vcpu->arch.pio.in) {  			/* FIXME: return into emulator if single-stepping.  
*/ @@ -8801,6 +8953,12 @@ writeback:  		unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);  		toggle_interruptibility(vcpu, ctxt->interruptibility);  		vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + +		/* +		 * Note, EXCPT_DB is assumed to be fault-like as the emulator +		 * only supports code breakpoints and general detect #DB, both +		 * of which are fault-like. +		 */  		if (!ctxt->have_exception ||  		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {  			kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); @@ -9662,74 +9820,155 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)  static void kvm_inject_exception(struct kvm_vcpu *vcpu)  { -	trace_kvm_inj_exception(vcpu->arch.exception.nr, +	trace_kvm_inj_exception(vcpu->arch.exception.vector,  				vcpu->arch.exception.has_error_code,  				vcpu->arch.exception.error_code,  				vcpu->arch.exception.injected);  	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))  		vcpu->arch.exception.error_code = false; -	static_call(kvm_x86_queue_exception)(vcpu); +	static_call(kvm_x86_inject_exception)(vcpu);  } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) +/* + * Check for any event (interrupt or exception) that is ready to be injected, + * and if there is at least one event, inject the event with the highest + * priority.  This handles both "pending" events, i.e. events that have never + * been injected into the guest, and "injected" events, i.e. events that were + * injected as part of a previous VM-Enter, but weren't successfully delivered + * and need to be re-injected. + * + * Note, this is not guaranteed to be invoked on a guest instruction boundary, + * i.e. doesn't guarantee that there's an event window in the guest.  KVM must + * be able to inject exceptions in the "middle" of an instruction, and so must + * also be able to re-inject NMIs and IRQs in the middle of an instruction. + * I.e. for exceptions and re-injected events, NOT invoking this on instruction + * boundaries is necessary and correct. + * + * For simplicity, KVM uses a single path to inject all events (except events + * that are injected directly from L1 to L2) and doesn't explicitly track + * instruction boundaries for asynchronous events.  However, because VM-Exits + * that can occur during instruction execution typically result in KVM skipping + * the instruction or injecting an exception, e.g. instruction and exception + * intercepts, and because pending exceptions have higher priority than pending + * interrupts, KVM still honors instruction boundaries in most scenarios. + * + * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip + * the instruction or inject an exception, then KVM can incorrecty inject a new + * asynchrounous event if the event became pending after the CPU fetched the + * instruction (in the guest).  E.g. if a page fault (#PF, #NPF, EPT violation) + * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be + * injected on the restarted instruction instead of being deferred until the + * instruction completes. + * + * In practice, this virtualization hole is unlikely to be observed by the + * guest, and even less likely to cause functional problems.  To detect the + * hole, the guest would have to trigger an event on a side effect of an early + * phase of instruction execution, e.g. on the instruction fetch from memory. 
+ * And for it to be a functional problem, the guest would need to depend on the + * ordering between that side effect, the instruction completing, _and_ the + * delivery of the asynchronous event. + */ +static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, +				       bool *req_immediate_exit)  { +	bool can_inject;  	int r; -	bool can_inject = true; -	/* try to reinject previous events if any */ +	/* +	 * Process nested events first, as nested VM-Exit supercedes event +	 * re-injection.  If there's an event queued for re-injection, it will +	 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. +	 */ +	if (is_guest_mode(vcpu)) +		r = kvm_check_nested_events(vcpu); +	else +		r = 0; -	if (vcpu->arch.exception.injected) { -		kvm_inject_exception(vcpu); -		can_inject = false; -	}  	/* -	 * Do not inject an NMI or interrupt if there is a pending -	 * exception.  Exceptions and interrupts are recognized at -	 * instruction boundaries, i.e. the start of an instruction. -	 * Trap-like exceptions, e.g. #DB, have higher priority than -	 * NMIs and interrupts, i.e. traps are recognized before an -	 * NMI/interrupt that's pending on the same instruction. -	 * Fault-like exceptions, e.g. #GP and #PF, are the lowest -	 * priority, but are only generated (pended) during instruction -	 * execution, i.e. a pending fault-like exception means the -	 * fault occurred on the *previous* instruction and must be -	 * serviced prior to recognizing any new events in order to -	 * fully complete the previous instruction. +	 * Re-inject exceptions and events *especially* if immediate entry+exit +	 * to/from L2 is needed, as any event that has already been injected +	 * into L2 needs to complete its lifecycle before injecting a new event. +	 * +	 * Don't re-inject an NMI or interrupt if there is a pending exception. +	 * This collision arises if an exception occurred while vectoring the +	 * injected event, KVM intercepted said exception, and KVM ultimately +	 * determined the fault belongs to the guest and queues the exception +	 * for injection back into the guest. +	 * +	 * "Injected" interrupts can also collide with pending exceptions if +	 * userspace ignores the "ready for injection" flag and blindly queues +	 * an interrupt.  In that case, prioritizing the exception is correct, +	 * as the exception "occurred" before the exit to userspace.  Trap-like +	 * exceptions, e.g. most #DBs, have higher priority than interrupts. +	 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest +	 * priority, they're only generated (pended) during instruction +	 * execution, and interrupts are recognized at instruction boundaries. +	 * Thus a pending fault-like exception means the fault occurred on the +	 * *previous* instruction and must be serviced prior to recognizing any +	 * new events in order to fully complete the previous instruction.  	 
*/ -	else if (!vcpu->arch.exception.pending) { -		if (vcpu->arch.nmi_injected) { -			static_call(kvm_x86_inject_nmi)(vcpu); -			can_inject = false; -		} else if (vcpu->arch.interrupt.injected) { -			static_call(kvm_x86_inject_irq)(vcpu, true); -			can_inject = false; -		} -	} +	if (vcpu->arch.exception.injected) +		kvm_inject_exception(vcpu); +	else if (kvm_is_exception_pending(vcpu)) +		; /* see above */ +	else if (vcpu->arch.nmi_injected) +		static_call(kvm_x86_inject_nmi)(vcpu); +	else if (vcpu->arch.interrupt.injected) +		static_call(kvm_x86_inject_irq)(vcpu, true); +	/* +	 * Exceptions that morph to VM-Exits are handled above, and pending +	 * exceptions on top of injected exceptions that do not VM-Exit should +	 * either morph to #DF or, sadly, override the injected exception. +	 */  	WARN_ON_ONCE(vcpu->arch.exception.injected &&  		     vcpu->arch.exception.pending);  	/* -	 * Call check_nested_events() even if we reinjected a previous event -	 * in order for caller to determine if it should require immediate-exit -	 * from L2 to L1 due to pending L1 events which require exit -	 * from L2 to L1. +	 * Bail if immediate entry+exit to/from the guest is needed to complete +	 * nested VM-Enter or event re-injection so that a different pending +	 * event can be serviced (or if KVM needs to exit to userspace). +	 * +	 * Otherwise, continue processing events even if VM-Exit occurred.  The +	 * VM-Exit will have cleared exceptions that were meant for L2, but +	 * there may now be events that can be injected into L1.  	 */ -	if (is_guest_mode(vcpu)) { -		r = kvm_check_nested_events(vcpu); -		if (r < 0) -			goto out; -	} +	if (r < 0) +		goto out; + +	/* +	 * A pending exception VM-Exit should either result in nested VM-Exit +	 * or force an immediate re-entry and exit to/from L2, and exception +	 * VM-Exits cannot be injected (flag should _never_ be set). +	 */ +	WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || +		     vcpu->arch.exception_vmexit.pending); + +	/* +	 * New events, other than exceptions, cannot be injected if KVM needs +	 * to re-inject a previous event.  See above comments on re-injecting +	 * for why pending exceptions get priority. +	 */ +	can_inject = !kvm_event_needs_reinjection(vcpu); -	/* try to inject new event if pending */  	if (vcpu->arch.exception.pending) { -		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) +		/* +		 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS +		 * value pushed on the stack.  Trap-like exception and all #DBs +		 * leave RF as-is (KVM follows Intel's behavior in this regard; +		 * AMD states that code breakpoint #DBs excplitly clear RF=0). +		 * +		 * Note, most versions of Intel's SDM and AMD's APM incorrectly +		 * describe the behavior of General Detect #DBs, which are +		 * fault-like.  They do _not_ set RF, a la code breakpoints. 
+		 */ +		if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)  			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |  					     X86_EFLAGS_RF); -		if (vcpu->arch.exception.nr == DB_VECTOR) { -			kvm_deliver_exception_payload(vcpu); +		if (vcpu->arch.exception.vector == DB_VECTOR) { +			kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);  			if (vcpu->arch.dr7 & DR7_GD) {  				vcpu->arch.dr7 &= ~DR7_GD;  				kvm_update_dr7(vcpu); @@ -9801,11 +10040,24 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)  	}  	if (is_guest_mode(vcpu) && -	    kvm_x86_ops.nested_ops->hv_timer_pending && -	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) +	    kvm_x86_ops.nested_ops->has_events && +	    kvm_x86_ops.nested_ops->has_events(vcpu))  		*req_immediate_exit = true; -	WARN_ON(vcpu->arch.exception.pending); +	/* +	 * KVM must never queue a new exception while injecting an event; KVM +	 * is done emulating and should only propagate the to-be-injected event +	 * to the VMCS/VMCB.  Queueing a new exception can put the vCPU into an +	 * infinite loop as KVM will bail from VM-Enter to inject the pending +	 * exception and start the cycle all over. +	 * +	 * Exempt triple faults as they have special handling and won't put the +	 * vCPU into an infinite loop.  Triple fault can be queued when running +	 * VMX without unrestricted guest, as that requires KVM to emulate Real +	 * Mode events (see kvm_inject_realmode_interrupt()). +	 */ +	WARN_ON_ONCE(vcpu->arch.exception.pending || +		     vcpu->arch.exception_vmexit.pending);  	return 0;  out: @@ -10110,7 +10362,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)  	 * When APICv gets disabled, we may still have injected interrupts  	 * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was  	 * still active when the interrupt got accepted. Make sure -	 * inject_pending_event() is called to check for that. +	 * kvm_check_and_inject_events() is called to check for that.  	 */  	if (!apic->apicv_active)  		kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -10152,7 +10404,10 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,  		kvm->arch.apicv_inhibit_reasons = new;  		if (new) {  			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); +			int idx = srcu_read_lock(&kvm->srcu); +  			kvm_zap_gfn_range(kvm, gfn, gfn+1); +			srcu_read_unlock(&kvm->srcu, idx);  		}  	} else {  		kvm->arch.apicv_inhibit_reasons = new; @@ -10407,7 +10662,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  			goto out;  		} -		r = inject_pending_event(vcpu, &req_immediate_exit); +		r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);  		if (r < 0) {  			r = 0;  			goto out; @@ -10646,10 +10901,26 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)  		if (hv_timer)  			kvm_lapic_switch_to_hv_timer(vcpu); -		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) +		/* +		 * If the vCPU is not runnable, a signal or another host event +		 * of some kind is pending; service it without changing the +		 * vCPU's activity state. +		 */ +		if (!kvm_arch_vcpu_runnable(vcpu))  			return 1;  	} +	/* +	 * Evaluate nested events before exiting the halted state.  This allows +	 * the halt state to be recorded properly in the VMCS12's activity +	 * state field (AMD does not have a similar field and a VM-Exit always +	 * causes a spurious wakeup from HLT). 
+	 */ +	if (is_guest_mode(vcpu)) { +		if (kvm_check_nested_events(vcpu) < 0) +			return 0; +	} +  	if (kvm_apic_accept_events(vcpu) < 0)  		return 0;  	switch(vcpu->arch.mp_state) { @@ -10673,9 +10944,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)  { -	if (is_guest_mode(vcpu)) -		kvm_check_nested_events(vcpu); -  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&  		!vcpu->arch.apf.halted);  } @@ -10824,6 +11092,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)  { +	struct kvm_queued_exception *ex = &vcpu->arch.exception;  	struct kvm_run *kvm_run = vcpu->run;  	int r; @@ -10852,7 +11121,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)  			r = 0;  			goto out;  		} -		kvm_clear_request(KVM_REQ_UNHALT, vcpu);  		r = -EAGAIN;  		if (signal_pending(current)) {  			r = -EINTR; @@ -10882,6 +11150,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)  		}  	} +	/* +	 * If userspace set a pending exception and L2 is active, convert it to +	 * a pending VM-Exit if L1 wants to intercept the exception. +	 */ +	if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && +	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, +							ex->error_code)) { +		kvm_queue_exception_vmexit(vcpu, ex->vector, +					   ex->has_error_code, ex->error_code, +					   ex->has_payload, ex->payload); +		ex->injected = false; +		ex->pending = false; +	} +	vcpu->arch.exception_from_userspace = false; +  	if (unlikely(vcpu->arch.complete_userspace_io)) {  		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;  		vcpu->arch.complete_userspace_io = NULL; @@ -10988,6 +11271,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);  	vcpu->arch.exception.pending = false; +	vcpu->arch.exception_vmexit.pending = false;  	kvm_make_request(KVM_REQ_EVENT, vcpu);  } @@ -11125,11 +11409,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,  	}  	/* -	 * KVM_MP_STATE_INIT_RECEIVED means the processor is in -	 * INIT state; latched init should be reported using -	 * KVM_SET_VCPU_EVENTS, so reject it here. +	 * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow +	 * forcing the guest into INIT/SIPI if those events are supposed to be +	 * blocked.  KVM prioritizes SMI over INIT, so reject INIT/SIPI state +	 * if an SMI is pending as well.  	 
*/ -	if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && +	if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&  	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||  	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))  		goto out; @@ -11368,7 +11653,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,  	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {  		r = -EBUSY; -		if (vcpu->arch.exception.pending) +		if (kvm_is_exception_pending(vcpu))  			goto out;  		if (dbg->control & KVM_GUESTDBG_INJECT_DB)  			kvm_queue_exception(vcpu, DB_VECTOR); @@ -11547,6 +11832,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)  	vcpu->arch.regs_avail = ~0;  	vcpu->arch.regs_dirty = ~0; +	kvm_gpc_init(&vcpu->arch.pv_time); +  	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;  	else @@ -11750,8 +12037,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  		struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;  		/* -		 * To avoid have the INIT path from kvm_apic_has_events() that be -		 * called with loaded FPU and does not let userspace fix the state. +		 * All paths that lead to INIT are required to load the guest's +		 * FPU state (because most paths are buried in KVM_RUN).  		 */  		if (init_event)  			kvm_put_guest_fpu(vcpu); @@ -12080,6 +12367,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  	if (ret)  		goto out_page_track; +	ret = static_call(kvm_x86_vm_init)(kvm); +	if (ret) +		goto out_uninit_mmu; +  	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);  	atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -12115,8 +12406,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  	kvm_hv_init_vm(kvm);  	kvm_xen_init_vm(kvm); -	return static_call(kvm_x86_vm_init)(kvm); +	return 0; +out_uninit_mmu: +	kvm_mmu_uninit_vm(kvm);  out_page_track:  	kvm_page_track_cleanup(kvm);  out: @@ -12589,13 +12882,14 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)  	if (!list_empty_careful(&vcpu->async_pf.done))  		return true; -	if (kvm_apic_has_events(vcpu)) +	if (kvm_apic_has_pending_init_or_sipi(vcpu) && +	    kvm_apic_init_sipi_allowed(vcpu))  		return true;  	if (vcpu->arch.pv.pv_unhalted)  		return true; -	if (vcpu->arch.exception.pending) +	if (kvm_is_exception_pending(vcpu))  		return true;  	if (kvm_test_request(KVM_REQ_NMI, vcpu) || @@ -12617,16 +12911,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)  		return true;  	if (is_guest_mode(vcpu) && -	    kvm_x86_ops.nested_ops->hv_timer_pending && -	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) +	    kvm_x86_ops.nested_ops->has_events && +	    kvm_x86_ops.nested_ops->has_events(vcpu))  		return true;  	if (kvm_xen_has_pending_events(vcpu))  		return true; -	if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) -		return true; -  	return false;  } @@ -12850,7 +13141,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)  {  	if (unlikely(!lapic_in_kernel(vcpu) ||  		     kvm_event_needs_reinjection(vcpu) || -		     vcpu->arch.exception.pending)) +		     kvm_is_exception_pending(vcpu)))  		return false;  	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) @@ -13401,7 +13692,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
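
Below, a minimal guest-side sketch of how the reworked force_emulation_prefix parameter is exercised. The fep_cpuid() wrapper is an illustrative assumption and not part of this patch; the five-byte signature matches the `ud2; .ascii "kvm"` prefix that handle_ud() checks in the hunk above, and the parameter is now a runtime-writable integer whose bit 1 (KVM_FEP_CLEAR_RFLAGS_RF) additionally clears RFLAGS.RF.

#include <stdint.h>

/*
 * Hypothetical guest-side helper, not part of this patch.  KVM_FEP is the
 * "ud2; .ascii \"kvm\"" signature that handle_ud() looks for when
 * force_emulation_prefix is non-zero.  The parameter is now an integer and
 * writable at runtime (0644), e.g. on the host:
 *   echo 1 > /sys/module/kvm/parameters/force_emulation_prefix   # emulate
 *   echo 3 > /sys/module/kvm/parameters/force_emulation_prefix   # also clear RFLAGS.RF
 */
#define KVM_FEP "ud2; .ascii \"kvm\"; "

static inline void fep_cpuid(uint32_t *eax, uint32_t *ebx,
			     uint32_t *ecx, uint32_t *edx)
{
	/* CPUID forced through KVM's instruction emulator. */
	asm volatile(KVM_FEP "cpuid"
		     : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		     : "0" (*eax), "2" (*ecx));
}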
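
The KVM_GET_VCPU_EVENTS hunks only preserve the pending/injected split and the raw exception payload when the per-VM KVM_CAP_EXCEPTION_PAYLOAD capability is enabled; otherwise the payload is delivered before the state is saved. A sketch of the userspace opt-in, assuming existing vm_fd/vcpu_fd descriptors (the helper name is made up):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/*
 * Sketch, not from this patch: enable the per-VM KVM_CAP_EXCEPTION_PAYLOAD
 * capability so KVM_GET_VCPU_EVENTS keeps pending vs. injected exceptions
 * distinct and reports the payload (CR2 for #PF, DR6 bits for #DB) instead
 * of delivering it to the vCPU before the save.
 */
static int save_events_with_payload(int vm_fd, int vcpu_fd,
				    struct kvm_vcpu_events *events)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_EXCEPTION_PAYLOAD,
		.args = { 1 },
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	memset(events, 0, sizeof(*events));
	return ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, events);
}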
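
For context on the new compat path, this is roughly what a native VMM passes to KVM_X86_SET_MSR_FILTER; the 32-bit compat structs in the hunk exist because the bitmap pointer below shrinks to 4 bytes for a 32-bit caller. The helper is a sketch: the KVM_MSR_FILTER_READ/WRITE flag names and the set-bit-means-allow semantics come from the existing uapi header and documentation, not from this diff.

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/*
 * Sketch, not from this patch: a default-deny filter that leaves one MSR
 * usable.  Assumption from the existing uapi (not this diff): a set bit in
 * a range's bitmap allows the access types named in .flags for that MSR.
 */
static int allow_only_one_msr(int vm_fd, uint32_t msr)
{
	uint8_t bitmap = 0x01;	/* bit 0 covers MSR 'base + 0' */
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_DENY,
		.ranges[0] = {
			.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
			.nmsrs = 1,
			.base = msr,
			.bitmap = &bitmap,
		},
	};

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}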