Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--   arch/x86/kvm/vmx.c | 203
1 file changed, 145 insertions(+), 58 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/kexec.h>
 
 #include "trace.h"
 
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
 	return vmx_capability.ept & VMX_EPT_AD_BIT;
 }
 
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
-	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
 static inline bool cpu_has_vmx_invept_context(void)
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct loaded_vmcs *v;
+
+	if (!crash_local_vmclear_enabled(cpu))
+		return;
+
+	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+			    loaded_vmcss_on_cpu_link)
+		vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
 static void __loaded_vmcs_clear(void *arg)
 {
 	struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
 		return; /* vcpu migration can race with cpu offline */
 	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
+	crash_disable_local_vmclear(cpu);
 	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+	/*
+	 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
+	 * is before setting loaded_vmcs->vcpu to -1 which is done in
+	 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
+	 * then adds the vmcs into percpu list before it is deleted.
+	 */
+	smp_wmb();
+
 	loaded_vmcs_init(loaded_vmcs);
+	crash_enable_local_vmclear(cpu);
 }
 
 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (loaded_vmcs->cpu != -1)
-		smp_call_function_single(
-			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+	int cpu = loaded_vmcs->cpu;
+
+	if (cpu != -1)
+		smp_call_function_single(cpu,
+			 __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
 	}
 }
 
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
-	if (enable_ept) {
-		if (cpu_has_vmx_invept_individual_addr())
-			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
-					eptp, gpa);
-		else
-			ept_sync_context(eptp);
-	}
-}
-
 static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
 		local_irq_disable();
+		crash_disable_local_vmclear(cpu);
+
+		/*
+		 * Read loaded_vmcs->cpu should be before fetching
+		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
+		 * See the comments in __loaded_vmcs_clear().
+		 */
+		smp_rmb();
+
 		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
+		crash_enable_local_vmclear(cpu);
 		local_irq_enable();
 
 		/*
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
  * Like guest_read_tsc, but always returns L1's notion of the timestamp
  * counter, even if a nested guest (L2) is currently running.
  */
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	u64 host_tsc, tsc_offset;
+	u64 tsc_offset;
 
-	rdtscll(host_tsc);
 	tsc_offset = is_guest_mode(vcpu) ?
 		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
 		vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	return vmcs_read64(TSC_OFFSET);
+}
+
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
 	int ret = 0;
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (msr_index) {
 	case MSR_EFER:
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 #ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, data);
+		kvm_write_tsc(vcpu, msr_info);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			vcpu->arch.pat = data;
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
+		break;
+	case MSR_IA32_TSC_ADJUST:
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 	case MSR_TSC_AUX:
 		if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			}
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 	}
 
 	return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+	/*
+	 * Now we can enable the vmclear operation in kdump
+	 * since the loaded_vmcss_on_cpu list on this cpu
+	 * has been initialized.
+	 *
+	 * Though the cpu is not in VMX operation now, there
+	 * is no problem to enable the vmclear operation
+	 * for the loaded_vmcss_on_cpu list is empty!
+	 */
+	crash_enable_local_vmclear(cpu);
+
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
 	if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
 		tmp.base = vmcs_readl(sf->base);
 		tmp.selector = vmcs_read16(sf->selector);
+		tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
 		tmp.s = 1;
 	}
 	vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
	 * unrestricted guest like Westmere to older host that don't have
	 * unrestricted guest like Nehelem.
	 */
-	if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+	if (vmx->rmode.vm86_active) {
 		switch (seg) {
 		case VCPU_SREG_CS:
 			vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	set_cr4_guest_host_mask(vmx);
 
-	kvm_write_tsc(&vmx->vcpu, 0);
-
 	return 0;
 }
 
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	u64 msr;
 	int ret;
 
-	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
 	vmx->rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
-	ret = fx_init(&vmx->vcpu);
-	if (ret != 0)
-		goto out;
-
 	vmx_segment_cache_clear(vmx);
 
 	seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		kvm_rip_write(vcpu, 0xfff0);
 	else
 		kvm_rip_write(vcpu, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
 	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	/* HACK: Don't enable emulation on guest boot/reset */
 	vmx->emulation_required = 0;
 
-out:
 	return ret;
 }
 
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	if (is_machine_check(intr_info))
 		return handle_machine_check(vcpu);
 
-	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-	    !is_page_fault(intr_info)) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 2;
-		vcpu->run->internal.data[0] = vect_info;
-		vcpu->run->internal.data[1] = intr_info;
-		return 0;
-	}
-
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+	/*
+	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
+	 * MMIO, it is better to report an internal error.
+	 * See the comments in vmx_handle_exit.
+	 */
+	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vect_info;
+		vcpu->run->internal.data[1] = intr_info;
+		return 0;
+	}
+
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
 		BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
+	struct msr_data msr;
 	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
 	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-	if (vmx_set_msr(vcpu, ecx, data) != 0) {
+	msr.data = data;
+	msr.index = ecx;
+	msr.host_initiated = false;
+	if (vmx_set_msr(vcpu, &msr) != 0) {
 		trace_kvm_msr_write_ex(ecx, data);
 		kvm_inject_gp(vcpu, 0);
 		return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-	if (exit_qualification & (1 << 6)) {
-		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
-		return -EINVAL;
-	}
-
 	gla_validity = (exit_qualification >> 7) & 0x3;
 	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+	/*
+	 * Note:
+	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
+	 * delivery event since it indicates guest is accessing MMIO.
+	 * The vm-exit can be triggered again after return to guest that
+	 * will cause infinite loop.
+	 */
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
-			exit_reason != EXIT_REASON_TASK_SWITCH))
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
-		       "(0x%x) and exit reason is 0x%x\n",
-		       __func__, vectoring_info, exit_reason);
+			exit_reason != EXIT_REASON_TASK_SWITCH)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vectoring_info;
+		vcpu->run->internal.data[1] = exit_reason;
+		return 0;
+	}
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
 	.set_tsc_khz = vmx_set_tsc_khz,
+	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 	.compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
 	if (r)
 		goto out3;
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+			   crash_vmclear_local_loaded_vmcss);
+#endif
+
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+	synchronize_rcu();
+#endif
+
 	kvm_exit();
 }
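Two notes on context the diff relies on but does not show.

First, the new vmx_set_msr() signature and handle_wrmsr() assume the struct msr_data type introduced on the common x86 KVM side (in the companion series, not in this file). A minimal sketch, field order illustrative only:

	/* sketch of the assumed struct msr_data (companion KVM x86 patch) */
	struct msr_data {
		bool host_initiated;	/* false for guest WRMSR, true for host ioctl writes */
		u32 index;		/* MSR number (ECX for WRMSR) */
		u64 data;		/* value to write (EDX:EAX for WRMSR) */
	};

Second, vmx_init()/vmx_exit() publish and retract crash_vmclear_local_loaded_vmcss() through the crash_vmclear_loaded_vmcss pointer declared in <asm/kexec.h>. The consumer is not part of this diff; a minimal sketch of how the kdump shutdown path might invoke the callback under RCU (the names crash_vmclear_fn and cpu_crash_vmclear_loaded_vmcss are assumptions taken from the companion kexec patch, not from this file):

	/* sketch of the assumed consumer on the crash path (arch/x86/kernel/crash.c) */
	crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
	EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);

	static inline void cpu_crash_vmclear_loaded_vmcss(void)
	{
		crash_vmclear_fn *do_vmclear_operation = NULL;

		rcu_read_lock();
		do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
		if (do_vmclear_operation)
			/* reaches crash_vmclear_local_loaded_vmcss() while kvm-intel is loaded */
			do_vmclear_operation();
		rcu_read_unlock();
	}

The rcu_assign_pointer(..., NULL) followed by synchronize_rcu() in vmx_exit() ensures a crashing cpu cannot still be executing the callback once the module is unloaded.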