Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
 -rw-r--r--  arch/x86/kvm/vmx/nested.c | 252
 1 file changed, 203 insertions(+), 49 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0e7c9301fe86..4aea7d304beb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -10,6 +10,7 @@
 #include "hyperv.h"
 #include "mmu.h"
 #include "nested.h"
+#include "pmu.h"
 #include "trace.h"
 #include "x86.h"
 
@@ -27,6 +28,16 @@ module_param(nested_early_check, bool, S_IRUGO);
 	failed;								\
 })
 
+#define SET_MSR_OR_WARN(vcpu, idx, data)				\
+({									\
+	bool failed = kvm_set_msr(vcpu, idx, data);			\
+	if (failed)							\
+		pr_warn_ratelimited(					\
+				"%s cannot write MSR (0x%x, 0x%llx)\n",	\
+				__func__, idx, data);			\
+	failed;								\
+})
+
 /*
  * Hyper-V requires all of these, so mark them as supported even though
  * they are just treated the same as all-context.
@@ -257,7 +268,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.cached_shadow_vmcs12 = NULL;
 	/* Unpin physical memory we referred to in the vmcs02 */
 	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
+		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -929,6 +940,57 @@ fail:
 	return i + 1;
 }
 
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+					    u32 msr_index,
+					    u64 *data)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * If the L0 hypervisor stored a more accurate value for the TSC that
+	 * does not include the time taken for emulation of the L2->L1
+	 * VM-exit in L0, use the more accurate value.
+	 */
+	if (msr_index == MSR_IA32_TSC) {
+		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
+					       MSR_IA32_TSC);
+
+		if (index >= 0) {
+			u64 val = vmx->msr_autostore.guest.val[index].value;
+
+			*data = kvm_read_l1_tsc(vcpu, val);
+			return true;
+		}
+	}
+
+	if (kvm_get_msr(vcpu, msr_index, data)) {
+		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+			msr_index);
+		return false;
+	}
+	return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+				     struct vmx_msr_entry *e)
+{
+	if (kvm_vcpu_read_guest(vcpu,
+				gpa + i * sizeof(*e),
+				e, 2 * sizeof(u32))) {
+		pr_debug_ratelimited(
+			"%s cannot read MSR entry (%u, 0x%08llx)\n",
+			__func__, i, gpa + i * sizeof(*e));
+		return false;
+	}
+	if (nested_vmx_store_msr_check(vcpu, e)) {
+		pr_debug_ratelimited(
+			"%s check failed (%u, 0x%x, 0x%x)\n",
+			__func__, i, e->index, e->reserved);
+		return false;
+	}
+	return true;
+}
+
 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 {
 	u64 data;
@@ -940,26 +1002,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		if (unlikely(i >= max_msr_list_size))
 			return -EINVAL;
 
-		if (kvm_vcpu_read_guest(vcpu,
-					gpa + i * sizeof(e),
-					&e, 2 * sizeof(u32))) {
-			pr_debug_ratelimited(
-				"%s cannot read MSR entry (%u, 0x%08llx)\n",
-				__func__, i, gpa + i * sizeof(e));
+		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
 			return -EINVAL;
-		}
-		if (nested_vmx_store_msr_check(vcpu, &e)) {
-			pr_debug_ratelimited(
-				"%s check failed (%u, 0x%x, 0x%x)\n",
-				__func__, i, e.index, e.reserved);
-			return -EINVAL;
-		}
-		if (kvm_get_msr(vcpu, e.index, &data)) {
-			pr_debug_ratelimited(
-				"%s cannot read MSR (%u, 0x%x)\n",
-				__func__, i, e.index);
+
+		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
 			return -EINVAL;
-		}
+
 		if (kvm_vcpu_write_guest(vcpu,
 					 gpa + i * sizeof(e) +
 					     offsetof(struct vmx_msr_entry, value),
@@ -973,6 +1021,60 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	return 0;
 }
 
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 count = vmcs12->vm_exit_msr_store_count;
+	u64 gpa = vmcs12->vm_exit_msr_store_addr;
+	struct vmx_msr_entry e;
+	u32 i;
+
+	for (i = 0; i < count; i++) {
+		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+			return false;
+
+		if (e.index == msr_index)
+			return true;
+	}
+	return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+					   u32 msr_index)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+	bool in_vmcs12_store_list;
+	int msr_autostore_index;
+	bool in_autostore_list;
+	int last;
+
+	msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
+	in_autostore_list = msr_autostore_index >= 0;
+	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+	if (in_vmcs12_store_list && !in_autostore_list) {
+		if (autostore->nr == NR_LOADSTORE_MSRS) {
+			/*
+			 * Emulated VMEntry does not fail here.  Instead a less
+			 * accurate value will be returned by
+			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
+			 * instead of reading the value from the vmcs02 VMExit
+			 * MSR-store area.
+			 */
+			pr_warn_ratelimited(
+				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
+				msr_index);
+			return;
+		}
+		last = autostore->nr++;
+		autostore->val[last].index = msr_index;
+	} else if (!in_vmcs12_store_list && in_autostore_list) {
+		last = --autostore->nr;
+		autostore->val[msr_autostore_index] = autostore->val[last];
+	}
+}
+
 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	unsigned long invalid_mask;
@@ -1012,7 +1114,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 		kvm_mmu_new_cr3(vcpu, cr3, false);
 
 	vcpu->arch.cr3 = cr3;
-	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
 	kvm_init_mmu(vcpu, false);
 
@@ -1024,7 +1126,9 @@
  * populated by L2 differently than TLB entries populated
  * by L1.
  *
- * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTP.
  *
  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
  * with different VPID (L1 entries are tagged with vmx->vpid
@@ -1034,7 +1138,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	return nested_cpu_has_ept(vmcs12) ||
+	return enable_ept ||
 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
 }
 
@@ -2018,7 +2122,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
 	 * addresses are constant (for vmcs02), the counts can change based
 	 * on L2's behavior, e.g. switching to/from long mode.
 	 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
@@ -2073,6 +2177,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	exec_control &= ~CPU_BASED_TPR_SHADOW;
 	exec_control |= vmcs12->cpu_based_vm_exec_control;
 
+	vmx->nested.l1_tpr_threshold = -1;
 	if (exec_control & CPU_BASED_TPR_SHADOW)
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
 #ifdef CONFIG_X86_64
@@ -2285,6 +2390,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
 	}
 
+	/*
+	 * Make sure the msr_autostore list is up to date before we set the
+	 * count in the vmcs02.
+	 */
+	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
@@ -2381,9 +2493,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
 	if (nested_cpu_has_ept(vmcs12))
 		nested_ept_init_mmu_context(vcpu);
-	else if (nested_cpu_has2(vmcs12,
-				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
-		vmx_flush_tlb(vcpu, true);
 
 	/*
 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -2418,6 +2527,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 				entry_failure_code))
 		return -EINVAL;
 
+	/*
+	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
+	 * on nested VM-Exit, which can occur without actually running L2 and
+	 * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+	 * transition to HLT instead of running L2.
+	 */
+	if (enable_ept)
+		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
 	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
 	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
 	    is_pae_paging(vcpu)) {
@@ -2430,6 +2549,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (!enable_ept)
 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 
+	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+	    SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+			    vmcs12->guest_ia32_perf_global_ctrl))
+		return -EINVAL;
+
 	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
 	kvm_rip_write(vcpu, vmcs12->guest_rip);
 	return 0;
@@ -2664,6 +2788,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
 		return -EINVAL;
 
+	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+					   vmcs12->host_ia32_perf_global_ctrl)))
+		return -EINVAL;
+
 #ifdef CONFIG_X86_64
 	ia32e = !!(vcpu->arch.efer & EFER_LMA);
 #else
@@ -2779,6 +2908,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 	}
 
+	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+					   vmcs12->guest_ia32_perf_global_ctrl)))
+		return -EINVAL;
+
 	/*
 	 * If the load IA32_EFER VM-entry control is 1, the following checks
 	 * are performed on the field for the IA32_EFER MSR:
@@ -2933,7 +3067,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 		 * to it so we can release it later.
 		 */
 		if (vmx->nested.apic_access_page) { /* shouldn't happen */
-			kvm_release_page_dirty(vmx->nested.apic_access_page);
+			kvm_release_page_clean(vmx->nested.apic_access_page);
 			vmx->nested.apic_access_page = NULL;
 		}
 		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
@@ -3461,6 +3595,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
 		if (block_nested_events)
 			return -EBUSY;
+		clear_bit(KVM_APIC_INIT, &apic->pending_events);
 		nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
 		return 0;
 	}
@@ -3864,8 +3999,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		vcpu->arch.pat = vmcs12->host_ia32_pat;
 	}
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
-		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
-			vmcs12->host_ia32_perf_global_ctrl);
+		SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+				vmcs12->host_ia32_perf_global_ctrl);
 
 	/* Set L1 segment info according to Intel SDM
 	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
@@ -3984,7 +4119,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 
 	nested_ept_uninit_mmu_context(vcpu);
 	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
 	/*
 	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -4112,6 +4247,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+	if (vmx->nested.l1_tpr_threshold != -1)
+		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
 
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
@@ -4119,15 +4256,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
 		vmx->nested.change_vmcs01_virtual_apic_mode = false;
 		vmx_set_virtual_apic_mode(vcpu);
-	} else if (!nested_cpu_has_ept(vmcs12) &&
-		   nested_cpu_has2(vmcs12,
-				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		vmx_flush_tlb(vcpu, true);
 	}
 
 	/* Unpin physical memory we referred to in vmcs02 */
 	if (vmx->nested.apic_access_page) {
-		kvm_release_page_dirty(vmx->nested.apic_access_page);
+		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -4327,6 +4460,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	return 0;
 }
 
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx;
+
+	if (!nested_vmx_allowed(vcpu))
+		return;
+
+	vmx = to_vmx(vcpu);
+	if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+		vmx->nested.msrs.entry_ctls_high |=
+				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+		vmx->nested.msrs.exit_ctls_high |=
+				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+	} else {
+		vmx->nested.msrs.entry_ctls_high &=
+				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+		vmx->nested.msrs.exit_ctls_high &=
+				~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+	}
+}
+
 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 {
 	gva_t gva;
@@ -5766,7 +5920,7 @@ error_guest_mode:
 	return ret;
 }
 
-void nested_vmx_vcpu_setup(void)
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
 {
 	if (enable_shadow_vmcs) {
 		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
@@ -6047,23 +6201,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 		init_vmcs_shadow_fields();
 	}
 
-	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear,
-	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch,
-	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld,
-	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst,
-	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread,
-	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume,
-	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite,
-	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff,
-	exit_handlers[EXIT_REASON_VMON]		= handle_vmon,
-	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept,
-	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid,
-	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc,
+	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
+	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
+	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
+	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
+	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
+	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
+	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
+	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
+	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
+	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
+	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
+	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;
 
 	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
 	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
 	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
-	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
+	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
 	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
 	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
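
Reviewer note on the msr_autostore changes above: prepare_vmx_msr_autostore_list() keeps a small fixed-capacity array in sync with vmcs12's VM-exit MSR-store list, appending an entry when the MSR becomes wanted, removing it by swapping in the last entry when it is no longer wanted, and degrading gracefully when the array is full (nested_vmx_get_vmexit_msr_value() then falls back to kvm_get_msr()). The sketch below is a minimal user-space model of that bookkeeping, not kernel code; the names (struct msr_slot, struct autostore_list, NR_SLOTS, find_slot, sync_slot) are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for struct vmx_msrs and NR_LOADSTORE_MSRS. */
#define NR_SLOTS 8

struct msr_slot {
	uint32_t index;
	uint64_t value;
};

struct autostore_list {
	unsigned int nr;
	struct msr_slot val[NR_SLOTS];
};

/* Linear search, mirroring what vmx_find_msr_index() is used for above. */
static int find_slot(const struct autostore_list *l, uint32_t msr)
{
	for (unsigned int i = 0; i < l->nr; i++)
		if (l->val[i].index == msr)
			return (int)i;
	return -1;
}

/*
 * Keep the list in sync with "wanted": append if newly wanted, remove by
 * swapping the last entry into the hole if no longer wanted.  When the
 * list is full, skip the MSR; the caller then falls back to a slower,
 * less accurate read, as the kernel comment in the diff explains.
 */
static void sync_slot(struct autostore_list *l, uint32_t msr, bool wanted)
{
	int i = find_slot(l, msr);

	if (wanted && i < 0) {
		if (l->nr == NR_SLOTS) {
			fprintf(stderr, "no free slot for MSR 0x%x\n", msr);
			return;
		}
		l->val[l->nr++].index = msr;
	} else if (!wanted && i >= 0) {
		l->val[i] = l->val[--l->nr];
	}
}

int main(void)
{
	struct autostore_list l = { 0 };

	sync_slot(&l, 0x10 /* IA32_TSC */, true);   /* added */
	sync_slot(&l, 0x10, true);                  /* idempotent */
	sync_slot(&l, 0x10, false);                 /* removed again */
	printf("entries: %u\n", l.nr);
	return 0;
}

Swap-with-last removal keeps the array dense without preserving order, which is acceptable because the hardware walks the MSR-store area as an unordered list.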
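Reviewer note on the TSC capture path: nested_vmx_get_vmexit_msr_value() prefers the IA32_TSC value that the CPU dumped into the vmcs02 VM-exit MSR-store slot and converts it with kvm_read_l1_tsc(), so the value handed back to L1 does not include the time L0 spends emulating the L2->L1 VM-exit. Below is a rough model of that conversion, assuming kvm_read_l1_tsc() computes L1's view as a scaled host TSC plus L1's TSC offset; treat the exact formula as an assumption, and note that model_read_l1_tsc and the plain multiplier standing in for the hardware's fixed-point scaling ratio are invented for the demo.

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed model of kvm_read_l1_tsc():
 *   L1's TSC = scale(host_tsc) + l1_tsc_offset
 * A plain multiplier stands in for the hardware scaling ratio.
 */
static uint64_t model_read_l1_tsc(uint64_t host_tsc, uint64_t mult,
				  uint64_t l1_tsc_offset)
{
	return host_tsc * mult + l1_tsc_offset;
}

int main(void)
{
	/* Value the CPU stored in the vmcs02 VM-exit MSR-store slot. */
	uint64_t stored_tsc = 123456789ULL;

	/* No scaling (mult == 1), L1 runs with a +1000 cycle TSC offset. */
	printf("L1 sees TSC %llu\n",
	       (unsigned long long)model_read_l1_tsc(stored_tsc, 1, 1000));
	return 0;
}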