Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--   arch/x86/kvm/x86.c   456
1 file changed, 304 insertions(+), 152 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93b0bd45ac73..31607174f442 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #endif
 
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
 
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages) },
+	{ "largepages", VM_STAT(lpages, .mode = 0444) },
 	{ "max_mmu_page_hash_collisions",
 		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
@@ -360,7 +360,8 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 asmlinkage __visible void kvm_spurious_fault(void)
 {
 	/* Fault while not rebooting.  We want the trace. */
-	BUG();
+	if (!kvm_rebooting)
+		BUG();
 }
 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 
@@ -674,8 +675,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 				       data, offset, len, access);
 }
 
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+	       rsvd_bits(1, 2);
+}
+
 /*
- * Load the pae pdptrs.  Return true is they are all valid.
+ * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
@@ -694,8 +701,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 	}
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 		if ((pdpte[i] & PT_PRESENT_MASK) &&
-		    (pdpte[i] &
-		     vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
+		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
 			ret = 0;
 			goto out;
 		}
@@ -879,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
 	if (cr4 & CR4_RESERVED_BITS)
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
-		return 1;
+		return -EINVAL;
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+		return -EINVAL;
+
+	return 0;
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+
+	if (kvm_valid_cr4(vcpu, cr4))
 		return 1;
 
 	if (is_long_mode(vcpu)) {
@@ -1140,6 +1154,44 @@ static u32 msrs_to_save[] = {
 	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+	MSR_IA32_UMWAIT_CONTROL,
+
+	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
+	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
+	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
+	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
+	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
+	MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19,
+	MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21,
+	MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23,
+	MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25,
+	MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27,
+	MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29,
+	MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31,
+	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31,
 };
 
 static unsigned num_msrs_to_save;
@@ -1254,6 +1306,13 @@ static u64 kvm_get_arch_capabilities(void)
 	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
 		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
 
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		data |= ARCH_CAP_RDCL_NO;
+	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		data |= ARCH_CAP_SSB_NO;
+	if (!boot_cpu_has_bug(X86_BUG_MDS))
+		data |= ARCH_CAP_MDS_NO;
+
 	return data;
 }
 
@@ -1351,19 +1410,23 @@ void kvm_enable_efer_bits(u64 mask)
 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 
 /*
- * Writes msr value into into the appropriate "register".
+ * Write @data into the MSR specified by @index.  Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+			 bool host_initiated)
 {
-	switch (msr->index) {
+	struct msr_data msr;
+
+	switch (index) {
 	case MSR_FS_BASE:
 	case MSR_GS_BASE:
 	case MSR_KERNEL_GS_BASE:
 	case MSR_CSTAR:
 	case MSR_LSTAR:
-		if (is_noncanonical_address(msr->data, vcpu))
+		if (is_noncanonical_address(data, vcpu))
 			return 1;
 		break;
 	case MSR_IA32_SYSENTER_EIP:
@@ -1380,38 +1443,95 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		 * value, and that something deterministic happens if the guest
 		 * invokes 64-bit SYSENTER.
 		 */
-		msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
+		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
 	}
-	return kvm_x86_ops->set_msr(vcpu, msr);
+
+	msr.data = data;
+	msr.index = index;
+	msr.host_initiated = host_initiated;
+
+	return kvm_x86_ops->set_msr(vcpu, &msr);
 }
-EXPORT_SYMBOL_GPL(kvm_set_msr);
 
 /*
- * Adapt set_msr() to msr_io()'s calling convention
+ * Read the MSR specified by @index into @data.  Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
 */
-static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+			 bool host_initiated)
 {
 	struct msr_data msr;
-	int r;
+	int ret;
 
 	msr.index = index;
-	msr.host_initiated = true;
-	r = kvm_get_msr(vcpu, &msr);
-	if (r)
-		return r;
+	msr.host_initiated = host_initiated;
 
-	*data = msr.data;
-	return 0;
+	ret = kvm_x86_ops->get_msr(vcpu, &msr);
+	if (!ret)
+		*data = msr.data;
+	return ret;
 }
 
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {
-	struct msr_data msr;
+	return __kvm_get_msr(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
 
-	msr.data = *data;
-	msr.index = index;
-	msr.host_initiated = true;
-	return kvm_set_msr(vcpu, &msr);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	return __kvm_set_msr(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+	u32 ecx = kvm_rcx_read(vcpu);
+	u64 data;
+
+	if (kvm_get_msr(vcpu, ecx, &data)) {
+		trace_kvm_msr_read_ex(ecx);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	trace_kvm_msr_read(ecx, data);
+
+	kvm_rax_write(vcpu, data & -1u);
+	kvm_rdx_write(vcpu, (data >> 32) & -1u);
+	return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+	u32 ecx = kvm_rcx_read(vcpu);
+	u64 data = kvm_read_edx_eax(vcpu);
+
+	if (kvm_set_msr(vcpu, ecx, data)) {
+		trace_kvm_msr_write_ex(ecx, data);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	trace_kvm_msr_write(ecx, data);
+	return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	return __kvm_get_msr(vcpu, index, data, true);
+}
+
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	return __kvm_set_msr(vcpu, index, *data, true);
 }
 
 #ifdef CONFIG_X86_64
@@ -2452,6 +2572,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
+	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+		vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
 	if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
 		kvm_vcpu_flush_tlb(vcpu, false);
 
@@ -2748,18 +2870,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
-{
-	return kvm_x86_ops->get_msr(vcpu, msr);
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr);
-
 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
 	u64 data;
@@ -3106,7 +3216,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_EVENTFD:
 	case KVM_CAP_HYPERV_TLBFLUSH:
 	case KVM_CAP_HYPERV_SEND_IPI:
-	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
 	case KVM_CAP_HYPERV_CPUID:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
@@ -3183,6 +3292,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = kvm_x86_ops->get_nested_state ?
 			kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
 		break;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		r = kvm_x86_ops->enable_direct_tlbflush != NULL;
+		break;
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+		r = kvm_x86_ops->nested_enable_evmcs != NULL;
+		break;
 	default:
 		break;
 	}
@@ -3506,8 +3621,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 
-	if (kvm_x86_ops->setup_mce)
-		kvm_x86_ops->setup_mce(vcpu);
+	kvm_x86_ops->setup_mce(vcpu);
 out:
 	return r;
 }
@@ -3957,6 +4071,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				r = -EFAULT;
 		}
 		return r;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		if (!kvm_x86_ops->enable_direct_tlbflush)
+			return -ENOTTY;
+
+		return kvm_x86_ops->enable_direct_tlbflush(vcpu);
 
 	default:
 		return -EINVAL;
@@ -4986,9 +5105,15 @@ out:
 
 static void kvm_init_msr_list(void)
 {
+	struct x86_pmu_capability x86_pmu;
 	u32 dummy[2];
 	unsigned i, j;
 
+	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+			 "Please update the fixed PMCs in msrs_to_save[]");
+
+	perf_get_x86_pmu_capability(&x86_pmu);
+
 	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
 		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
 			continue;
@@ -5029,6 +5154,15 @@ static void kvm_init_msr_list(void)
 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
 				continue;
 			break;
+		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 31:
+			if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+				continue;
+			break;
+		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 31:
+			if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+				continue;
 		}
 		default:
 			break;
@@ -5312,6 +5446,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
 	vcpu->arch.l1tf_flush_l1d = true;
 
+	/*
+	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+	 * is returned, but our callers are not ready for that and they blindly
+	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
+	 * uninitialized kernel stack memory into cr2 and error code.
+	 */
+	memset(exception, 0, sizeof(*exception));
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }
@@ -5320,7 +5461,6 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 int handle_ud(struct kvm_vcpu *vcpu)
 {
 	int emul_type = EMULTYPE_TRAP_UD;
-	enum emulation_result er;
 	char sig[5]; /* ud2; .ascii "kvm" */
 	struct x86_exception e;
 
@@ -5329,15 +5469,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
 				sig, sizeof(sig), &e) == 0 &&
 	    memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
 		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
-		emul_type = 0;
+		emul_type = EMULTYPE_TRAP_UD_FORCED;
 	}
 
-	er = kvm_emulate_instruction(vcpu, emul_type);
-	if (er == EMULATE_USER_EXIT)
-		return 0;
-	if (er != EMULATE_DONE)
-		kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
+	return kvm_emulate_instruction(vcpu, emul_type);
 }
 EXPORT_SYMBOL_GPL(handle_ud);
 
@@ -5370,7 +5505,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	 */
 	if (vcpu_match_mmio_gva(vcpu, gva)
 	    && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-				 vcpu->arch.access, 0, access)) {
+				 vcpu->arch.mmio_access, 0, access)) {
 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 					(gva & (PAGE_SIZE - 1));
 		trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -5964,28 +6099,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 *pdata)
 {
-	struct msr_data msr;
-	int r;
-
-	msr.index = msr_index;
-	msr.host_initiated = false;
-	r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
-	if (r)
-		return r;
-
-	*pdata = msr.data;
-	return 0;
+	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
 }
 
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 data)
 {
-	struct msr_data msr;
-
-	msr.data = data;
-	msr.index = msr_index;
-	msr.host_initiated = false;
-	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
+	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
 }
 
 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6068,6 +6188,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
 	kvm_smm_changed(emul_to_vcpu(ctxt));
 }
 
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
+{
+	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
+}
+
 static const struct x86_emulate_ops emulate_ops = {
 	.read_gpr            = emulator_read_gpr,
 	.write_gpr           = emulator_write_gpr,
@@ -6109,6 +6234,7 @@ static const struct x86_emulate_ops emulate_ops = {
 	.set_hflags          = emulator_set_hflags,
 	.pre_leave_smm       = emulator_pre_leave_smm,
 	.post_leave_smm      = emulator_post_leave_smm,
+	.set_xcr             = emulator_set_xcr,
 };
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6168,7 +6294,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
@@ -6180,37 +6306,43 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 	ctxt->_eip = ctxt->eip + inc_eip;
 	ret = emulate_int_real(ctxt, irq);
 
-	if (ret != X86EMUL_CONTINUE)
-		return EMULATE_FAIL;
-
-	ctxt->eip = ctxt->_eip;
-	kvm_rip_write(vcpu, ctxt->eip);
-	kvm_set_rflags(vcpu, ctxt->eflags);
-
-	return EMULATE_DONE;
+	if (ret != X86EMUL_CONTINUE) {
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	} else {
+		ctxt->eip = ctxt->_eip;
+		kvm_rip_write(vcpu, ctxt->eip);
+		kvm_set_rflags(vcpu, ctxt->eflags);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 {
-	int r = EMULATE_DONE;
-
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
 
-	if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
-		return EMULATE_FAIL;
+	if (emulation_type & EMULTYPE_VMWARE_GP) {
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
 
-	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+	if (emulation_type & EMULTYPE_SKIP) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		vcpu->run->internal.ndata = 0;
-		r = EMULATE_USER_EXIT;
+		return 0;
 	}
 
 	kvm_queue_exception(vcpu, UD_VECTOR);
 
-	return r;
+	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	return 1;
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
@@ -6365,7 +6497,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
 	return dr6;
 }
 
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *kvm_run = vcpu->run;
 
@@ -6374,18 +6506,20 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
 		kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
 		kvm_run->debug.arch.exception = DB_VECTOR;
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
-		*r = EMULATE_USER_EXIT;
-	} else {
-		kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+		return 0;
 	}
+	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+	return 1;
 }
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
-	int r = EMULATE_DONE;
+	int r;
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	r = kvm_x86_ops->skip_emulated_instruction(vcpu);
+	if (unlikely(!r))
+		return 0;
 
 	/*
 	 * rflags is the old, "raw" value of the flags.  The new value has
@@ -6396,8 +6530,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	 * that sets the TF flag".
 	 */
 	if (unlikely(rflags & X86_EFLAGS_TF))
-		kvm_vcpu_do_singlestep(vcpu, &r);
-	return r == EMULATE_DONE;
+		r = kvm_vcpu_do_singlestep(vcpu);
+	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
 
@@ -6416,7 +6550,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 			kvm_run->debug.arch.pc = eip;
 			kvm_run->debug.arch.exception = DB_VECTOR;
 			kvm_run->exit_reason = KVM_EXIT_DEBUG;
-			*r = EMULATE_USER_EXIT;
+			*r = 0;
 			return true;
 		}
 	}
@@ -6432,7 +6566,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			kvm_queue_exception(vcpu, DB_VECTOR);
-			*r = EMULATE_DONE;
+			*r = 1;
 			return true;
 		}
 	}
@@ -6516,32 +6650,48 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
 		if (r != EMULATION_OK)  {
-			if (emulation_type & EMULTYPE_TRAP_UD)
-				return EMULATE_FAIL;
+			if ((emulation_type & EMULTYPE_TRAP_UD) ||
+			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+				kvm_queue_exception(vcpu, UD_VECTOR);
+				return 1;
+			}
 			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
 						emulation_type))
-				return EMULATE_DONE;
-			if (ctxt->have_exception && inject_emulated_exception(vcpu))
-				return EMULATE_DONE;
-			if (emulation_type & EMULTYPE_SKIP)
-				return EMULATE_FAIL;
+				return 1;
+			if (ctxt->have_exception) {
+				/*
+				 * #UD should result in just EMULATION_FAILED, and trap-like
+				 * exception should not be encountered during decode.
+				 */
+				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+				inject_emulated_exception(vcpu);
+				return 1;
+			}
			return handle_emulation_failure(vcpu, emulation_type);
		}
	}
 
-	if ((emulation_type & EMULTYPE_VMWARE) &&
-	    !is_vmware_backdoor_opcode(ctxt))
-		return EMULATE_FAIL;
+	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+	    !is_vmware_backdoor_opcode(ctxt)) {
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
 
+	/*
+	 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
+	 * for kvm_skip_emulated_instruction().  The caller is responsible for
+	 * updating interruptibility state and injecting single-step #DBs.
+	 */
 	if (emulation_type & EMULTYPE_SKIP) {
 		kvm_rip_write(vcpu, ctxt->_eip);
 		if (ctxt->eflags & X86_EFLAGS_RF)
 			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
-		return EMULATE_DONE;
+		return 1;
 	}
 
 	if (retry_instruction(ctxt, cr2, emulation_type))
-		return EMULATE_DONE;
+		return 1;
 
 	/* this is needed for vmware backdoor interface to work since it
 	   changes registers values  during IO operation */
@@ -6557,18 +6707,18 @@ restart:
 	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)
-		return EMULATE_DONE;
+		return 1;
 
 	if (r == EMULATION_FAILED) {
 		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
 					emulation_type))
-			return EMULATE_DONE;
+			return 1;
 
 		return handle_emulation_failure(vcpu, emulation_type);
 	}
 
 	if (ctxt->have_exception) {
-		r = EMULATE_DONE;
+		r = 1;
 		if (inject_emulated_exception(vcpu))
 			return r;
 	} else if (vcpu->arch.pio.count) {
@@ -6579,27 +6729,30 @@ restart:
 			writeback = false;
 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
 		}
-		r = EMULATE_USER_EXIT;
+		r = 0;
 	} else if (vcpu->mmio_needed) {
+		++vcpu->stat.mmio_exits;
+
 		if (!vcpu->mmio_is_write)
 			writeback = false;
-		r = EMULATE_USER_EXIT;
+		r = 0;
 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
 	else
-		r = EMULATE_DONE;
+		r = 1;
 
 	if (writeback) {
 		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-		kvm_rip_write(vcpu, ctxt->eip);
-		if (r == EMULATE_DONE && ctxt->tf)
-			kvm_vcpu_do_singlestep(vcpu, &r);
 		if (!ctxt->have_exception ||
-		    exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+			kvm_rip_write(vcpu, ctxt->eip);
+			if (r && ctxt->tf)
+				r = kvm_vcpu_do_singlestep(vcpu);
 			__kvm_set_rflags(vcpu, ctxt->eflags);
+		}
 
 		/*
 		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -8191,12 +8344,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
 {
 	int r;
+
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-	if (r != EMULATE_DONE)
-		return 0;
-	return 1;
+	return r;
 }
 
 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -8564,23 +8716,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
-
-	if (ret)
-		return EMULATE_FAIL;
+	if (ret) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
 
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	return EMULATE_DONE;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
 static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-			(sregs->cr4 & X86_CR4_OSXSAVE))
-		return  -EINVAL;
-
 	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
 		/*
 		 * When EFER.LME and CR0.PG are set, the processor is in
@@ -8599,7 +8750,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 			return -EINVAL;
 	}
 
-	return 0;
+	return kvm_valid_cr4(vcpu, sregs->cr4);
 }
 
 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
@@ -9289,6 +9440,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
@@ -9314,10 +9466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm_page_track_init(kvm);
 	kvm_mmu_init_vm(kvm);
 
-	if (kvm_x86_ops->vm_init)
-		return kvm_x86_ops->vm_init(kvm);
-
-	return 0;
+	return kvm_x86_ops->vm_init(kvm);
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -9621,8 +9770,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * Scan sptes if dirty logging has been stopped, dropping those
 	 * which can be collapsed into a single large-page spte.  Later
 	 * page faults will create the large-page sptes.
+	 *
+	 * There is no need to do this in any of the following cases:
+	 * CREATE:	No dirty mappings will already exist.
+	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
+	 *		kvm_arch_flush_shadow_memslot()
 	 */
-	if ((change != KVM_MR_DELETE) &&
+	if (change == KVM_MR_FLAGS_ONLY &&
 		(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
 		!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_zap_collapsible_sptes(kvm, new);
@@ -10009,7 +10163,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 
 bool kvm_arch_has_irq_bypass(void)
 {
-	return kvm_x86_ops->update_pi_irte != NULL;
+	return true;
 }
 
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
@@ -10049,9 +10203,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				   uint32_t guest_irq, bool set)
 {
-	if (!kvm_x86_ops->update_pi_irte)
-		return -EINVAL;
-
 	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
 }
 
@@ -10078,11 +10229,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
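The VM_STAT()/VCPU_STAT() change near the top of the diff makes both macros variadic so that an optional debugfs attribute such as ".mode = 0444" (used for the "largepages" entry) can be appended to the existing expansion. The sketch below is illustrative only: the struct and field names are invented for the demo, not KVM's, and it relies on the same GNU ", ## __VA_ARGS__" comma-swallowing extension the kernel uses.

/* Build and run with: cc -std=gnu99 demo_stat.c && ./a.out (gcc or clang). */
#include <stddef.h>
#include <stdio.h>

struct demo_stats { unsigned long mmu_unsync; unsigned long lpages; };

struct demo_entry {
	const char *name;
	size_t offset;
	unsigned int mode;	/* 0 means "use the default mode" */
};

/* ", ## __VA_ARGS__" drops the trailing comma when no extra argument is
 * given, so the macro works both with and without a designated initializer. */
#define DEMO_STAT(x, ...) \
	{ #x, offsetof(struct demo_stats, x), ## __VA_ARGS__ }

static const struct demo_entry entries[] = {
	DEMO_STAT(mmu_unsync),			/* mode stays 0 (default)      */
	DEMO_STAT(lpages, .mode = 0444),	/* read-only, like "largepages" */
};

int main(void)
{
	for (size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); i++)
		printf("%s: offset=%zu mode=%o\n",
		       entries[i].name, entries[i].offset, entries[i].mode);
	return 0;
}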
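kvm_emulate_rdmsr() and kvm_emulate_wrmsr(), added by the MSR refactor above, centralize the RDMSR/WRMSR register marshalling: the 64-bit MSR value travels split across EDX:EAX, with the low half in EAX and the high half in EDX. The following stand-alone sketch is plain user-space C with invented helper names (no KVM internals); it only demonstrates that split/join convention.

#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit value the way kvm_rax_write()/kvm_rdx_write() do above. */
static void split_msr(uint64_t data, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)(data & 0xffffffffu);	/* the "data & -1u" in the patch */
	*edx = (uint32_t)(data >> 32);
}

/* Recombine the halves, mirroring what kvm_read_edx_eax() provides. */
static uint64_t join_msr(uint32_t eax, uint32_t edx)
{
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	uint32_t eax, edx;
	uint64_t value = 0x123456789abcdef0ULL;

	split_msr(value, &eax, &edx);
	printf("eax=%#x edx=%#x roundtrip=%#llx\n",
	       (unsigned)eax, (unsigned)edx,
	       (unsigned long long)join_msr(eax, edx));
	return 0;
}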
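Much of the remaining churn replaces the EMULATE_DONE/EMULATE_USER_EXIT/EMULATE_FAIL results with plain int returns: 1 means "resume the guest", 0 means "exit to user space", and failures now queue an exception or fill kvm_run directly instead of reporting EMULATE_FAIL. The toy dispatcher below is only an illustration of that calling convention under made-up names, not KVM code.

#include <stdbool.h>
#include <stdio.h>

/* 1 == keep running the guest, 0 == bounce out to user space, mirroring the
 * return values that replaced EMULATE_DONE / EMULATE_USER_EXIT in the diff. */
static int handle_exit(bool need_userspace)
{
	return need_userspace ? 0 : 1;
}

int main(void)
{
	/* A caller loops while handlers return 1 and stops on 0. */
	bool pending[] = { false, false, true };
	int i = 0;

	while (handle_exit(pending[i]))
		i++;
	printf("exited to user space after %d in-kernel iterations\n", i);
	return 0;
}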