Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--  arch/x86/kvm/vmx.c | 241
1 file changed, 155 insertions(+), 86 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 801332edefc3..bfe11cf124a1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -383,6 +383,9 @@ struct nested_vmx {
 
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
+
+	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+	u64 vmcs01_debugctl;
 };
 
 #define POSTED_INTR_ON  0
@@ -740,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static bool vmx_mpx_supported(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -820,7 +822,6 @@ static const u32 vmx_msr_index[] = {
 #endif
 	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 };
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
 static inline bool is_page_fault(u32 intr_info)
 {
@@ -1940,7 +1941,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 {
 	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 	int ret = 0;
@@ -1950,7 +1951,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
 		ret |= KVM_X86_SHADOW_INT_MOV_SS;
 
-	return ret & mask;
+	return ret;
 }
 
 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -2239,10 +2240,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
  * or other means.
  */
 static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_true_procbased_ctls_low;
 static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_true_exit_ctls_low;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_true_entry_ctls_low;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
@@ -2265,21 +2269,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	/* pin-based controls */
 	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
 	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-	/*
-	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
-	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
-	 */
 	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
 		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
 	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
 
-	/*
-	 * Exit controls
-	 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
-	 * 17 must be 1.
-	 */
+	/* exit controls */
 	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
 		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
 	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -2296,10 +2292,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	if (vmx_mpx_supported())
 		nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
+	/* We support free control of debug control saving. */
+	nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+		~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
-	/* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
 	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_entry_ctls_high &=
 #ifdef CONFIG_X86_64
@@ -2311,10 +2310,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	if (vmx_mpx_supported())
 		nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
+	/* We support free control of debug control loading. */
+	nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+		~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
 		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
-	nested_vmx_procbased_ctls_low = 0;
+	nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_procbased_ctls_high &=
 		CPU_BASED_VIRTUAL_INTR_PENDING |
 		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
@@ -2335,7 +2338,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 * can use it to avoid exits to L1 - even when L0 runs L2
 	 * without MSR bitmaps.
 	 */
-	nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+	nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+		CPU_BASED_USE_MSR_BITMAPS;
+
+	/* We support free control of CR3 access interception. */
+	nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
 
 	/* secondary cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -2394,7 +2402,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		 * guest, and the VMCS structure we give it - not about the
 		 * VMX support of the underlying hardware.
 		 */
-		*pdata = VMCS12_REVISION |
+		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
 			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
 		break;
@@ -2404,16 +2412,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 					nested_vmx_pinbased_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
+					nested_vmx_procbased_ctls_high);
+		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS:
 		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
 					nested_vmx_procbased_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
+					nested_vmx_exit_ctls_high);
+		break;
 	case MSR_IA32_VMX_EXIT_CTLS:
 		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
 					nested_vmx_exit_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
+					nested_vmx_entry_ctls_high);
+		break;
 	case MSR_IA32_VMX_ENTRY_CTLS:
 		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
 					nested_vmx_entry_ctls_high);
@@ -2442,7 +2459,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = -1ULL;
 		break;
 	case MSR_IA32_VMX_VMCS_ENUM:
-		*pdata = 0x1f;
+		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
 		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
@@ -3653,7 +3670,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
 
 out:
-	vmx->emulation_required |= emulation_required(vcpu);
+	vmx->emulation_required = emulation_required(vcpu);
 }
 
 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -4422,7 +4439,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmx->vcpu.arch.pat = host_pat;
 	}
 
-	for (i = 0; i < NR_VMX_MSR; ++i) {
+	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
 		u32 index = vmx_msr_index[i];
 		u32 data_low, data_high;
 		int j = vmx->nmsrs;
@@ -4873,7 +4890,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		if (!(vcpu->guest_debug &
 		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
 			vcpu->arch.dr6 &= ~15;
-			vcpu->arch.dr6 |= dr6;
+			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			if (!(dr6 & ~DR6_RESERVED)) /* icebp */
 				skip_emulated_instruction(vcpu);
 
@@ -5039,7 +5056,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	reg = (exit_qualification >> 8) & 15;
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
-		val = kvm_register_read(vcpu, reg);
+		val = kvm_register_readl(vcpu, reg);
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
@@ -5056,7 +5073,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			return 1;
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
-				u8 cr8 = kvm_register_read(vcpu, reg);
+				u8 cr8 = (u8)val;
 				err = kvm_set_cr8(vcpu, cr8);
 				kvm_complete_insn_gp(vcpu, err);
 				if (irqchip_in_kernel(vcpu->kvm))
@@ -5132,7 +5149,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 			return 0;
 		} else {
 			vcpu->arch.dr7 &= ~DR7_GD;
-			vcpu->arch.dr6 |= DR6_BD;
+			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
 			vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			return 1;
@@ -5165,7 +5182,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 			return 1;
 		kvm_register_write(vcpu, reg, val);
 	} else
-		if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
+		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
 			return 1;
 
 	skip_emulated_instruction(vcpu);
@@ -5621,7 +5638,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
-	while (!guest_state_valid(vcpu) && count-- != 0) {
+	while (vmx->emulation_required && count-- != 0) {
 		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
 			return handle_interrupt_window(&vmx->vcpu);
 
@@ -5655,7 +5672,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			schedule();
 	}
 
-	vmx->emulation_required = emulation_required(vcpu);
 out:
 	return ret;
 }
@@ -5754,22 +5770,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 
 /*
  * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
 */
 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 {
 	struct vmcs02_list *item, *n;
+
+	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
 	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-		if (vmx->loaded_vmcs != &item->vmcs02)
-			free_loaded_vmcs(&item->vmcs02);
+		/*
+		 * Something will leak if the above WARN triggers.  Better than
+		 * a use-after-free.
+		 */
+		if (vmx->loaded_vmcs == &item->vmcs02)
+			continue;
+
+		free_loaded_vmcs(&item->vmcs02);
 		list_del(&item->list);
 		kfree(item);
+		vmx->nested.vmcs02_num--;
 	}
-	vmx->nested.vmcs02_num = 0;
-
-	if (vmx->loaded_vmcs != &vmx->vmcs01)
-		free_loaded_vmcs(&vmx->vmcs01);
 }
 
 /*
@@ -5918,7 +5939,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		 * which replaces physical address width with 32
 		 *
 		 */
-		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failInvalid(vcpu);
 			skip_emulated_instruction(vcpu);
 			return 1;
@@ -5936,7 +5957,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		vmx->nested.vmxon_ptr = vmptr;
 		break;
 	case EXIT_REASON_VMCLEAR:
-		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMCLEAR_INVALID_ADDRESS);
 			skip_emulated_instruction(vcpu);
@@ -5951,7 +5972,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		}
 		break;
 	case EXIT_REASON_VMPTRLD:
-		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMPTRLD_INVALID_ADDRESS);
 			skip_emulated_instruction(vcpu);
@@ -6086,20 +6107,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
 	u32 exec_control;
 
+	if (vmx->nested.current_vmptr == -1ull)
+		return;
+
+	/* current_vmptr and current_vmcs12 are always set/reset together */
+	if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
+		return;
+
 	if (enable_shadow_vmcs) {
-		if (vmx->nested.current_vmcs12 != NULL) {
-			/* copy to memory all shadowed fields in case
-			   they were modified */
-			copy_shadow_to_vmcs12(vmx);
-			vmx->nested.sync_shadow_vmcs = false;
-			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-			exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-			vmcs_write64(VMCS_LINK_POINTER, -1ull);
-		}
+		/* copy to memory all shadowed fields in case
+		   they were modified */
+		copy_shadow_to_vmcs12(vmx);
+		vmx->nested.sync_shadow_vmcs = false;
+		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		vmcs_write64(VMCS_LINK_POINTER, -1ull);
 	}
 	kunmap(vmx->nested.current_vmcs12_page);
 	nested_release_page(vmx->nested.current_vmcs12_page);
+	vmx->nested.current_vmptr = -1ull;
+	vmx->nested.current_vmcs12 = NULL;
 }
 
 /*
@@ -6110,12 +6138,9 @@ static void free_nested(struct vcpu_vmx *vmx)
 {
 	if (!vmx->nested.vmxon)
 		return;
+
 	vmx->nested.vmxon = false;
-	if (vmx->nested.current_vmptr != -1ull) {
-		nested_release_vmcs12(vmx);
-		vmx->nested.current_vmptr = -1ull;
-		vmx->nested.current_vmcs12 = NULL;
-	}
+	nested_release_vmcs12(vmx);
 	if (enable_shadow_vmcs)
 		free_vmcs(vmx->nested.current_shadow_vmcs);
 	/* Unpin physical memory we referred to in current vmcs02 */
@@ -6152,11 +6177,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
 		return 1;
 
-	if (vmptr == vmx->nested.current_vmptr) {
+	if (vmptr == vmx->nested.current_vmptr)
 		nested_release_vmcs12(vmx);
-		vmx->nested.current_vmptr = -1ull;
-		vmx->nested.current_vmcs12 = NULL;
-	}
 
 	page = nested_get_page(vcpu, vmptr);
 	if (page == NULL) {
@@ -6384,7 +6406,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		return 1;
 
 	/* Decode instruction info and find the field to read */
-	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	/* Read the field, zero-extended to a u64 field_value */
 	if (!vmcs12_read_any(vcpu, field, &field_value)) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -6397,7 +6419,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	 * on the guest's mode (32 or 64 bit), not on the given field's length.
 	 */
 	if (vmx_instruction_info & (1u << 10)) {
-		kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
 			field_value);
 	} else {
 		if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -6434,21 +6456,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (vmx_instruction_info & (1u << 10))
-		field_value = kvm_register_read(vcpu,
+		field_value = kvm_register_readl(vcpu,
 			(((vmx_instruction_info) >> 3) & 0xf));
 	else {
 		if (get_vmx_mem_address(vcpu, exit_qualification,
 				vmx_instruction_info, &gva))
 			return 1;
 		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
-			   &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+			   &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
 			kvm_inject_page_fault(vcpu, &e);
 			return 1;
 		}
 	}
 
 
-	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	if (vmcs_field_readonly(field)) {
 		nested_vmx_failValid(vcpu,
 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
@@ -6498,9 +6520,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
-		if (vmx->nested.current_vmptr != -1ull)
-			nested_release_vmcs12(vmx);
 
+		nested_release_vmcs12(vmx);
 		vmx->nested.current_vmptr = vmptr;
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
@@ -6571,7 +6592,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	}
 
 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
 	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
@@ -6751,7 +6772,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	int cr = exit_qualification & 15;
 	int reg = (exit_qualification >> 8) & 15;
-	unsigned long val = kvm_register_read(vcpu, reg);
+	unsigned long val = kvm_register_readl(vcpu, reg);
 
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
@@ -7112,7 +7133,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 	if (max_irr == -1)
 		return;
 
-	vmx_set_rvi(max_irr);
+	/*
+	 * If a vmexit is needed, vmx_check_nested_events handles it.
+	 */
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+		return;
+
+	if (!is_guest_mode(vcpu)) {
+		vmx_set_rvi(max_irr);
+		return;
+	}
+
+	/*
+	 * Fall back to pre-APICv interrupt injection since L2
+	 * is run without virtual interrupt delivery.
+	 */
+	if (!kvm_event_needs_reinjection(vcpu) &&
+	    vmx_interrupt_allowed(vcpu)) {
+		kvm_queue_interrupt(vcpu, max_irr, false);
+		vmx_inject_irq(vcpu);
+	}
 }
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -7520,13 +7560,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx_complete_interrupts(vmx);
 }
 
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+
+	if (vmx->loaded_vmcs == &vmx->vmcs01)
+		return;
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	free_loaded_vmcs(vmx->loaded_vmcs);
+	leave_guest_mode(vcpu);
+	vmx_load_vmcs01(vcpu);
 	free_nested(vmx);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7548,6 +7606,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto free_vcpu;
 
 	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+		     > PAGE_SIZE);
+
 	err = -ENOMEM;
 	if (!vmx->guest_msrs) {
 		goto uninit_vcpu;
@@ -7836,7 +7897,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
 
-	vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	} else {
+		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 		vmcs12->vm_entry_intr_info_field);
 	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -7846,7 +7913,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 		vmcs12->guest_interruptibility_info);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-	kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 		vmcs12->guest_pending_dbg_exceptions);
@@ -8113,14 +8179,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
-			!IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+			!PAGE_ALIGNED(vmcs12->msr_bitmap)) {
 		/*TODO: Also verify bits beyond physical address width are 0*/
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
-			!IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+			!PAGE_ALIGNED(vmcs12->apic_access_addr)) {
 		/*TODO: Also verify bits beyond physical address width are 0*/
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
@@ -8136,15 +8202,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
-	      nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+				nested_vmx_true_procbased_ctls_low,
+				nested_vmx_procbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
 	      nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
 	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
 	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_exit_controls,
-	      nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+				nested_vmx_true_exit_ctls_low,
+				nested_vmx_exit_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_entry_controls,
-	      nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+				nested_vmx_true_entry_ctls_low,
+				nested_vmx_entry_ctls_high))
 	{
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
@@ -8221,6 +8290,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
 
+	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
 	cpu = get_cpu();
 	vmx->loaded_vmcs = vmcs02;
 	vmx_vcpu_put(vcpu);
@@ -8398,7 +8470,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
 	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
-	kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
 	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
 	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
 	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
@@ -8477,9 +8548,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
 
+	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	}
+
 	/* TODO: These cannot have changed unless we have MSR bitmaps and
 	 * the relevant bit asks not to trap the change */
-	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
 		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
@@ -8670,7 +8745,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 			      unsigned long exit_qualification)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int cpu;
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 	/* trying to cancel vmlaunch/vmresume is a bug */
@@ -8680,6 +8754,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
 		       exit_qualification);
 
+	vmx_load_vmcs01(vcpu);
+
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
 	    && nested_exit_intr_ack_set(vcpu)) {
 		int irq = kvm_cpu_get_interrupt(vcpu);
@@ -8695,13 +8771,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 				       vmcs12->vm_exit_intr_error_code,
 				       KVM_ISA_VMX);
 
-	cpu = get_cpu();
-	vmx->loaded_vmcs = &vmx->vmcs01;
-	vmx_vcpu_put(vcpu);
-	vmx_vcpu_load(vcpu, cpu);
-	vcpu->cpu = cpu;
-	put_cpu();
-
 	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
 	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
 	vmx_segment_cache_clear(vmx);
@@ -8890,7 +8959,7 @@ static int __init vmx_init(void)
 
 	rdmsrl_safe(MSR_EFER, &host_efer);
 
-	for (i = 0; i < NR_VMX_MSR; ++i)
+	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
 		kvm_define_shared_msr(i, vmx_msr_index[i]);
 
 	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
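
The control-MSR hunks above all rely on the same convention: in each IA32_VMX_*_CTLS MSR pair the low 32 bits are the allowed-0 settings (bits the guest must keep at 1) and the high 32 bits are the allowed-1 settings (bits the guest may set), and the new TRUE_* variants only clear the "always on without TRUE MSR" bits from the low half, so an L1 hypervisor may leave controls such as VM_ENTRY_LOAD_DEBUG_CONTROLS, VM_EXIT_SAVE_DEBUG_CONTROLS and the CR3-exiting controls at 0. The standalone C sketch below is not part of the patch; it only mirrors the shape of the vmx_control_verify() check that nested_vmx_run() now applies against the TRUE low values, and the capability numbers in it are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* VM-entry control bit 2, "load debug controls" (Intel SDM). */
#define VM_ENTRY_LOAD_DEBUG_CONTROLS	(1u << 2)

/*
 * Same shape as KVM's vmx_control_verify(): every bit set in 'low'
 * must be set in 'control', and no bit outside 'high' may be set.
 */
static int control_allowed(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & high) | low) == control;
}

int main(void)
{
	/* Hypothetical capability values, for illustration only. */
	uint32_t entry_low  = 0x000011ff;	/* default MSR: bit 2 forced to 1 */
	uint32_t entry_high = 0x0000ffff;
	/* TRUE MSR: same low half, minus the debug-controls bit. */
	uint32_t true_low   = entry_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* An L1 that chooses not to load debug controls on VM-entry. */
	uint32_t ctl = entry_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	printf("default MSR check: %d\n", control_allowed(ctl, entry_low, entry_high));
	printf("TRUE MSR check:    %d\n", control_allowed(ctl, true_low, entry_high));
	return 0;
}

Run as-is, the first check prints 0 and the second prints 1: clearing the debug-controls bit is rejected against the default capability MSR but accepted against the TRUE variant, which is exactly the extra freedom these patches advertise to L1.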