diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 1392 |
1 files changed, 785 insertions, 607 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a8b96dc4cd83..f427723dc7db 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -34,6 +34,7 @@ #include <linux/tboot.h> #include <linux/hrtimer.h> #include <linux/frame.h> +#include <linux/nospec.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -111,6 +112,14 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define MSR_BITMAP_MODE_X2APIC 1 +#define MSR_BITMAP_MODE_X2APIC_APICV 2 +#define MSR_BITMAP_MODE_LM 4 + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ @@ -185,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO); extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 struct vmcs { u32 revision_id; @@ -210,6 +218,7 @@ struct loaded_vmcs { int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; + unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; }; @@ -226,7 +235,7 @@ struct shared_msr_entry { * stored in guest memory specified by VMPTRLD, but is opaque to the guest, * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). @@ -409,12 +418,11 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need @@ -439,16 +447,17 @@ struct nested_vmx { * data hold by vmcs12 */ bool sync_shadow_vmcs; + bool dirty_vmcs12; - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; + + struct loaded_vmcs vmcs02; + /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; struct page *virtual_apic_page; @@ -457,8 +466,6 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; - unsigned long *msr_bitmap; - struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -581,6 +588,7 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; + u8 msr_bitmap_mode; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -592,6 +600,10 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + + u64 arch_capabilities; + u64 spec_ctrl; + u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; u32 secondary_exec_control; @@ -659,6 +671,8 @@ struct vcpu_vmx { u32 host_pkru; + unsigned long host_debugctlmsr; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -687,67 +701,24 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [number] = VMCS12_OFFSET(name) -#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ - [number##_HIGH] = VMCS12_OFFSET(name)+4 +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) -static unsigned long shadow_read_only_fields[] = { - /* - * We do NOT shadow fields that are modified when L0 - * traps and emulates any vmx instruction (e.g. VMPTRLD, - * VMXON...) executed by L1. - * For example, VM_INSTRUCTION_ERROR is read - * by L1 if a vmx instruction fails (part of the error path). - * Note the code assumes this logic. If for some reason - * we start shadowing these fields then we need to - * force a shadow sync when L0 emulates vmx instructions - * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified - * by nested_vmx_failValid) - */ - VM_EXIT_REASON, - VM_EXIT_INTR_INFO, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_INFO_FIELD, - IDT_VECTORING_ERROR_CODE, - VM_EXIT_INTR_ERROR_CODE, - EXIT_QUALIFICATION, - GUEST_LINEAR_ADDRESS, - GUEST_PHYSICAL_ADDRESS +static u16 shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static unsigned long shadow_read_write_fields[] = { - TPR_THRESHOLD, - GUEST_RIP, - GUEST_RSP, - GUEST_CR0, - GUEST_CR3, - GUEST_CR4, - GUEST_INTERRUPTIBILITY_INFO, - GUEST_RFLAGS, - GUEST_CS_SELECTOR, - GUEST_CS_AR_BYTES, - GUEST_CS_LIMIT, - GUEST_CS_BASE, - GUEST_ES_BASE, - GUEST_BNDCFGS, - CR0_GUEST_HOST_MASK, - CR0_READ_SHADOW, - CR4_READ_SHADOW, - TSC_OFFSET, - EXCEPTION_BITMAP, - CPU_BASED_VM_EXEC_CONTROL, - VM_ENTRY_EXCEPTION_ERROR_CODE, - VM_ENTRY_INTR_INFO_FIELD, - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE, - HOST_FS_BASE, - HOST_GS_BASE, - HOST_FS_SELECTOR, - HOST_GS_SELECTOR +static u16 shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); @@ -898,21 +869,22 @@ static const unsigned short vmcs_field_to_offset_table[] = { static inline short vmcs_field_to_offset(unsigned long field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); + unsigned short offset; + unsigned index; - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + if (field >> 15) return -ENOENT; - /* - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a - * generic mechanism. - */ - asm("lfence"); - - if (vmcs_field_to_offset_table[field] == 0) + index = ROL16(field, 6); + if (index >= size) return -ENOENT; - return vmcs_field_to_offset_table[field]; + index = array_index_nospec(index, size); + offset = vmcs_field_to_offset_table[index]; + if (offset == 0) + return -ENOENT; + return offset; } static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) @@ -935,6 +907,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -952,14 +927,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); enum { - VMX_IO_BITMAP_A, - VMX_IO_BITMAP_B, - VMX_MSR_BITMAP_LEGACY, - VMX_MSR_BITMAP_LONGMODE, - VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, - VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, - VMX_MSR_BITMAP_LEGACY_X2APIC, - VMX_MSR_BITMAP_LONGMODE_X2APIC, VMX_VMREAD_BITMAP, VMX_VMWRITE_BITMAP, VMX_BITMAP_NR @@ -967,14 +934,6 @@ enum { static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; -#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) -#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) -#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) -#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) -#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) -#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) -#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) -#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) @@ -1918,6 +1877,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +/* + * Check if MSR is intercepted for currently loaded MSR bitmap. + */ +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + +/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -2296,6 +2301,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); + indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -2333,6 +2339,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); + vmx->host_debugctlmsr = get_debugctlmsr(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -2572,36 +2579,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) -{ - unsigned long *msr_bitmap; - - if (is_guest_mode(vcpu)) - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; - else if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; - } else { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; - } - } else { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - } - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); -} - /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -2642,7 +2619,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(&vmx->vcpu); + vmx_update_msr_bitmap(&vmx->vcpu); } /* @@ -2920,7 +2897,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - vmx->nested.nested_vmx_vmcs_enum = 0x2e; + vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; } /* @@ -3256,6 +3233,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; switch (msr_info->index) { @@ -3267,8 +3245,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); - msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + vmx_load_host_state(vmx); + msr_info->data = vmx->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: @@ -3276,6 +3254,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: msr_info->data = guest_read_tsc(vcpu); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return 1; + msr_info->data = to_vmx(vcpu)->arch_capabilities; + break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; @@ -3294,13 +3286,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & + !(vmx->msr_ia32_feature_control & FEATURE_CONTROL_LMCE)) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEATURE_CONTROL: - msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; + msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3317,7 +3309,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_info->index); + msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; @@ -3383,6 +3375,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + return 1; + + vmx->spec_ctrl = data; + + if (!data) + break; + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. We update the vmcs01 here for L1 as well + * since it will end up touching the MSR anyway now. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + if (data & ~PRED_CMD_IBPB) + return 1; + + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + vmx->arch_capabilities = data; + break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) @@ -3639,7 +3695,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #endif CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_INVLPG_EXITING | @@ -3669,6 +3725,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -3837,11 +3894,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) return vmcs; } -static struct vmcs *alloc_vmcs(void) -{ - return alloc_vmcs_cpu(raw_smp_processor_id()); -} - static void free_vmcs(struct vmcs *vmcs) { free_pages((unsigned long)vmcs, vmcs_config.order); @@ -3857,9 +3909,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; + if (loaded_vmcs->msr_bitmap) + free_page((unsigned long)loaded_vmcs->msr_bitmap); WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(raw_smp_processor_id()); +} + +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + loaded_vmcs->vmcs = alloc_vmcs(); + if (!loaded_vmcs->vmcs) + return -ENOMEM; + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs_init(loaded_vmcs); + + if (cpu_has_vmx_msr_bitmap()) { + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!loaded_vmcs->msr_bitmap) + goto out_vmcs; + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); + } + return 0; + +out_vmcs: + free_loaded_vmcs(loaded_vmcs); + return -ENOMEM; +} + static void free_kvm_area(void) { int cpu; @@ -3870,17 +3951,17 @@ static void free_kvm_area(void) } } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 }; -static inline int vmcs_field_type(unsigned long field) +static inline int vmcs_field_width(unsigned long field) { if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; + return VMCS_FIELD_WIDTH_U32; return (field >> 13) & 0x3 ; } @@ -3893,43 +3974,66 @@ static void init_vmcs_shadow_fields(void) { int i, j; - /* No checks for read only fields yet */ + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { - switch (shadow_read_write_fields[i]) { - case GUEST_BNDCFGS: - if (!kvm_mpx_supported()) + u16 field = shadow_read_write_fields[i]; + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ + switch (field) { + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) continue; break; default: break; } + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif if (j < i) - shadow_read_write_fields[j] = - shadow_read_write_fields[i]; + shadow_read_write_fields[j] = field; j++; } max_shadow_read_write_fields = j; - - /* shadowed fields guest access without vmexit */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - unsigned long field = shadow_read_write_fields[i]; - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { - clear_bit(field + 1, vmx_vmwrite_bitmap); - clear_bit(field + 1, vmx_vmread_bitmap); - } - } - for (i = 0; i < max_shadow_read_only_fields; i++) { - unsigned long field = shadow_read_only_fields[i]; - - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) - clear_bit(field + 1, vmx_vmread_bitmap); - } } static __init int alloc_kvm_area(void) @@ -4142,9 +4246,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) { - if (enable_ept) { + if (enable_ept && (invalidate_gpa || !enable_vpid)) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); @@ -4153,15 +4258,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) } } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) { if (enable_ept) - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -4359,7 +4464,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -4376,6 +4481,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + if (cr4 & X86_CR4_VMXE) { /* * To use VMXON (and later other VMX instructions), a guest @@ -4918,10 +5031,8 @@ static void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -4955,6 +5066,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type, bool value) +{ + if (value) + vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + else + vmx_disable_intercept_for_msr(msr_bitmap, msr, type); +} + /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4965,11 +5120,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, { int f = sizeof(unsigned long); - if (!cpu_has_vmx_msr_bitmap()) { - WARN_ON(1); - return; - } - /* * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals * have the write-low and read-high bitmap offsets the wrong way round. @@ -5001,30 +5151,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) { - if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, - msr, MSR_TYPE_R | MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, - msr, MSR_TYPE_R | MSR_TYPE_W); + u8 mode = 0; + + if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode |= MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } + + if (is_long_mode(vcpu)) + mode |= MSR_BITMAP_MODE_LM; + + return mode; } -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, + u8 mode) { - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, - msr, type); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, - msr, type); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, type); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, type); + int msr; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + } + + if (mode & MSR_BITMAP_MODE_X2APIC) { + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + } } } +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u8 mode = vmx_msr_bitmap_mode(vcpu); + u8 changed = mode ^ vmx->msr_bitmap_mode; + + if (!changed) + return; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_LM)); + + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + + vmx->msr_bitmap_mode = mode; +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; @@ -5069,7 +5259,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); kunmap(vmx->nested.virtual_apic_page); status = vmcs_read16(GUEST_INTR_STATUS); @@ -5129,14 +5320,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + /* the PIR and ON have been set by L1. */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + kvm_vcpu_kick(vcpu); return 0; } return -1; @@ -5274,7 +5466,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -5315,6 +5507,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) struct kvm_vcpu *vcpu = &vmx->vcpu; u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) @@ -5333,6 +5526,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, + * in vmx_set_cr4. */ + exec_control &= ~SECONDARY_EXEC_DESC; + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). We can NOT enable shadow_vmcs here because we don't have yet @@ -5452,16 +5650,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) #endif int i; - /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); - if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -5539,6 +5733,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); @@ -5567,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u64 cr0; vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -6107,6 +6304,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } +static int handle_desc(struct kvm_vcpu *vcpu) +{ + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + return emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -6563,7 +6766,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - return kvm_skip_emulated_instruction(vcpu); + /* + * Doing kvm_skip_emulated_instruction() depends on undefined + * behavior: Intel's manual doesn't mandate + * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG + * occurs and while on real hardware it was observed to be set, + * other hypervisors (namely Hyper-V) don't set it, we end up + * advancing IP with some random value. Disable fast mmio when + * running nested and keep it for real hardware in hope that + * VM_EXIT_INSTRUCTION_LEN will always be set correctly. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return kvm_skip_emulated_instruction(vcpu); + else + return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, + NULL, 0) == EMULATE_DONE; } ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -6744,7 +6961,7 @@ void vmx_enable_tdp(void) static __init int hardware_setup(void) { - int r = -ENOMEM, i, msr; + int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -6760,13 +6977,6 @@ static __init int hardware_setup(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; goto out; @@ -6779,11 +6989,6 @@ static __init int hardware_setup(void) !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || @@ -6835,42 +7040,8 @@ static __init int hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 48; } - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, - vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - for (msr = 0x800; msr <= 0x8ff; msr++) { - if (msr == 0x839 /* TMCCT */) - continue; - vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); - } - - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - - /* EOI */ - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); - /* SELF-IPI */ - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) vmx_enable_tdp(); else @@ -6903,6 +7074,11 @@ static __init int hardware_setup(void) kvm_x86_ops->cancel_hv_timer = NULL; } + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + kvm_set_posted_intr_wakeup_handler(wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -6974,94 +7150,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) } /* - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. - * We could reuse a single VMCS for all the L2 guests, but we also want the - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this - * allows keeping them loaded on the processor, and in the future will allow - * optimizations where prepare_vmcs02 doesn't need to set all the fields on - * every entry if they never change. - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. - * - * The following functions allocate and free a vmcs02 in this pool. - */ - -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmx->nested.current_vmptr) { - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { - /* Recycle the least recently used VMCS. */ - item = list_last_entry(&vmx->nested.vmcs02_pool, - struct vmcs02_list, list); - item->vmptr = vmx->nested.current_vmptr; - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - /* Create a new VMCS */ - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); - if (!item) - return NULL; - item->vmcs02.vmcs = alloc_vmcs(); - item->vmcs02.shadow_vmcs = NULL; - if (!item->vmcs02.vmcs) { - kfree(item); - return NULL; - } - loaded_vmcs_init(&item->vmcs02); - item->vmptr = vmx->nested.current_vmptr; - list_add(&(item->list), &(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num++; - return &item->vmcs02; -} - -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmptr) { - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - return; - } -} - -/* - * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs - * must be &vmx->vmcs01. - */ -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item, *n; - - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - /* - * Something will leak if the above WARN triggers. Better than - * a use-after-free. - */ - if (vmx->loaded_vmcs == &item->vmcs02) - continue; - - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - } -} - -/* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction, as specified * by Vol 2B, VMX Instruction Reference, "Conventions". @@ -7241,13 +7329,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + int r; - if (cpu_has_vmx_msr_bitmap()) { - vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx->nested.msr_bitmap) - goto out_msr_bitmap; - } + r = alloc_loaded_vmcs(&vmx->nested.vmcs02); + if (r < 0) + goto out_vmcs02; vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) @@ -7264,9 +7350,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = shadow_vmcs; } - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; @@ -7278,9 +7361,9 @@ out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); + free_loaded_vmcs(&vmx->nested.vmcs02); -out_msr_bitmap: +out_vmcs02: return -ENOMEM; } @@ -7423,10 +7506,6 @@ static void free_nested(struct vcpu_vmx *vmx) free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; - if (vmx->nested.msr_bitmap) { - free_page((unsigned long)vmx->nested.msr_bitmap); - vmx->nested.msr_bitmap = NULL; - } if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); @@ -7434,7 +7513,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); - /* Unpin physical memory we referred to in current vmcs02 */ + /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; @@ -7450,7 +7529,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - nested_free_all_saved_vmcss(vmx); + free_loaded_vmcs(&vmx->nested.vmcs02); } /* Emulate the VMXOFF instruction */ @@ -7493,8 +7572,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - nested_free_vmcs02(vmx, vmptr); - nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -7532,17 +7609,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, p = ((char *)(get_vmcs12(vcpu))) + offset; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *ret = *((natural_width *)p); return 0; - case VMCS_FIELD_TYPE_U16: + case VMCS_FIELD_WIDTH_U16: *ret = *((u16 *)p); return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *ret = *((u32 *)p); return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *ret = *((u64 *)p); return 0; default: @@ -7559,17 +7636,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; return 0; default: @@ -7585,7 +7662,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - const unsigned long *fields = shadow_read_write_fields; + const u16 *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; preempt_disable(); @@ -7594,23 +7671,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < num_fields; i++) { field = fields[i]; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - field_value = vmcs_read16(field); - break; - case VMCS_FIELD_TYPE_U32: - field_value = vmcs_read32(field); - break; - case VMCS_FIELD_TYPE_U64: - field_value = vmcs_read64(field); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - field_value = vmcs_readl(field); - break; - default: - WARN_ON(1); - continue; - } + field_value = __vmcs_readl(field); vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -7622,7 +7683,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const unsigned long *fields[] = { + const u16 *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -7641,24 +7702,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - vmcs_write16(field, (u16)field_value); - break; - case VMCS_FIELD_TYPE_U32: - vmcs_write32(field, (u32)field_value); - break; - case VMCS_FIELD_TYPE_U64: - vmcs_write64(field, (u64)field_value); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - vmcs_writel(field, (long)field_value); - break; - default: - WARN_ON(1); - break; - } + __vmcs_writel(field, field_value); } } @@ -7727,8 +7771,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { unsigned long field; gva_t gva; + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -7771,6 +7817,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } + switch (field) { +#define SHADOW_FIELD_RW(x) case x: +#include "vmx_shadow_fields.h" + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -7785,6 +7845,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } + vmx->nested.dirty_vmcs12 = true; } /* Emulate the VMPTRLD instruction */ @@ -8005,7 +8066,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); @@ -8199,6 +8260,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, @@ -8406,10 +8469,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) /* * The host physical addresses of some pages of guest memory - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU - * may write to these pages via their host physical address while - * L2 is running, bypassing any address-translation-based dirty - * tracking (e.g. EPT write protection). + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. @@ -8943,7 +9007,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) @@ -9007,36 +9071,23 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - - if (max_irr == -1) - return; - /* - * In guest mode. If a vmexit is needed, vmx_check_nested_events - * handles it. + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. */ - if (nested_exit_on_intr(vcpu)) - return; - - /* - * Else, fall back to pre-APICv interrupt injection since L2 - * is run without virtual interrupt delivery. - */ - if (!kvm_event_needs_reinjection(vcpu) && - vmx_interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, max_irr, false); - vmx_inject_irq(vcpu); - } + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); } static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; + bool max_irr_updated; WARN_ON(!vcpu->arch.apicv_active); if (pi_test_on(&vmx->pi_desc)) { @@ -9046,7 +9097,23 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + max_irr_updated = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + + /* + * If we are running L2 and L1 has a new pending interrupt + * which can be injected, we should re-evaluate + * what should be done with this new L1 interrupt. + * If L1 intercepts external-interrupts, we should + * exit from L2 to L1. Otherwise, interrupt should be + * delivered directly to L2. + */ + if (is_guest_mode(vcpu) && max_irr_updated) { + if (nested_exit_on_intr(vcpu)) + kvm_vcpu_exiting_guest_mode(vcpu); + else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } @@ -9161,6 +9228,12 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -9316,7 +9389,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr3, cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -9369,10 +9442,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); - debugctlmsr = get_debugctlmsr(); vmx_arm_hv_timer(vcpu); + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -9491,12 +9572,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. + * + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (debugctlmsr) - update_debugctlmsr(debugctlmsr); + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); #ifndef CONFIG_X86_64 /* @@ -9576,10 +9678,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); @@ -9604,6 +9704,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + unsigned long *msr_bitmap; int cpu; if (!vmx) @@ -9636,13 +9737,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->guest_msrs) goto free_pml; - vmx->loaded_vmcs = &vmx->vmcs01; - vmx->loaded_vmcs->vmcs = alloc_vmcs(); - vmx->loaded_vmcs->shadow_vmcs = NULL; - if (!vmx->loaded_vmcs->vmcs) + err = alloc_loaded_vmcs(&vmx->vmcs01); + if (err < 0) goto free_msrs; - loaded_vmcs_init(vmx->loaded_vmcs); + msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx->msr_bitmap_mode = 0; + + vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; @@ -9771,7 +9879,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) u32 mask = SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -9937,8 +10046,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) @@ -10027,11 +10136,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); } - if (cpu_has_vmx_msr_bitmap() && - nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; - else + if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_USE_MSR_BITMAPS); } @@ -10099,48 +10204,90 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. */ -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { int msr; struct page *page; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* + * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; - /* This shortcut is ok because we support only x2APIC MSRs so far. */ - if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !pred_cmd && !spec_ctrl) return false; page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); if (is_error_page(page)) return false; + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } - memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_W); - if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { - if (nested_cpu_has_apic_reg_virt(vmcs12)) - for (msr = 0x800; msr <= 0x8ff; msr++) - nested_vmx_disable_intercept_for_msr( + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } + + if (spec_ctrl) + nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - msr, MSR_TYPE_R); + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + if (pred_cmd) nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_R | MSR_TYPE_W); - - if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); - } - } + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, + MSR_TYPE_W); + kunmap(page); kvm_release_page_clean(page); @@ -10406,25 +10553,12 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 0; } -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - bool from_vmentry, u32 *entry_failure_code) +static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); @@ -10432,7 +10566,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); @@ -10442,15 +10575,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); @@ -10460,6 +10590,125 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + /* + * Set host-state according to L0's settings (vmcs12 is irrelevant here) + * Some constant fields are set here by vmx_set_constant_host_state(). + * Other fields are different per CPU, and will be set later when + * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + */ + vmx_set_constant_host_state(vmx); + + /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + set_cr4_guest_host_mask(vmx); + + if (vmx_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } + + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control, vmcs12_exec_ctrl; + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. + */ + + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + + /* + * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, + * HOST_FS_BASE, HOST_GS_BASE. + */ + if (from_vmentry && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); @@ -10482,16 +10731,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -10505,7 +10745,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10516,25 +10755,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_preemption_timer(vmcs12)) vmx_start_preemption_timer(vcpu); - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx->secondary_exec_control; @@ -10553,22 +10773,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } - /* All VMFUNCs are currently emulated through L0 vmexits. */ - if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { - vmcs_write64(EOI_EXIT_BITMAP0, - vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, - vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, - vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, - vmcs12->eoi_exit_bitmap3); + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - } /* * Write an illegal value to APIC_ACCESS_ADDR. Later, @@ -10581,24 +10788,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } - - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. - */ - vmx_set_constant_host_state(vmx); - - /* - * Set the MSR load/store lists to match L0's settings. - */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value @@ -10630,8 +10819,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * Merging of IO bitmap not currently supported. - * Rather, exit every time. + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. */ exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -10668,12 +10857,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - set_cr4_guest_host_mask(vmx); - - if (from_vmentry && - vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset + vmcs12->tsc_offset); @@ -10692,16 +10875,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * even if spawn a lot of nested vCPUs. */ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } - } if (enable_pml) { @@ -10750,6 +10930,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); + if (vmx->nested.dirty_vmcs12) { + prepare_vmcs02_full(vcpu, vmcs12, from_vmentry); + vmx->nested.dirty_vmcs12 = false; + } + /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) @@ -10758,16 +10943,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - /* - * L1 may access the L2's PDPTR, so save them to construct vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); return 0; @@ -10903,20 +11078,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct loaded_vmcs *vmcs02; u32 msr_entry_idx; u32 exit_qual; - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - vmx_switch_vmcs(vcpu, vmcs02); + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { @@ -11128,7 +11298,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - vcpu->arch.exception.pending = false; return 0; } @@ -11409,11 +11578,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * L1's vpid. TODO: move to a more elaborate solution, giving * each L2 its own vpid and exposing the vpid feature to L1. */ - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } - /* Restore posted intr vector. */ - if (nested_cpu_has_posted_intr(vmcs12)) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@ -11485,7 +11651,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_DEBUGCTL, 0); if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) @@ -11534,10 +11700,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); - /* if no vmcs02 cache requested, remove the one we used */ - if (VMCS02_POOL_SIZE == 0) - nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11678,6 +11840,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ + if (info->intercept == x86_intercept_rdtscp && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + + /* TODO: check more intercepts... */ return X86EMUL_CONTINUE; } @@ -12191,6 +12368,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, + .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, |