aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt13
-rw-r--r--arch/arm/kvm/arm.c4
-rw-r--r--arch/mips/kvm/mips.c7
-rw-r--r--arch/powerpc/kvm/powerpc.c6
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/x86/include/asm/kvm_host.h5
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/lapic.c64
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/svm.c55
-rw-r--r--arch/x86/kvm/vmx.c752
-rw-r--r--arch/x86/kvm/x86.c109
-rw-r--r--drivers/ptp/ptp_kvm.c7
-rw-r--r--include/linux/kvm_host.h17
-rw-r--r--include/uapi/linux/kvm.h4
-rw-r--r--virt/kvm/kvm_main.c113
16 files changed, 572 insertions, 594 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e4f2cdcf78eb..069450938b79 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3389,7 +3389,18 @@ struct kvm_run {
Request that KVM_RUN return when it becomes possible to inject external
interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
- __u8 padding1[7];
+ __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+exits immediately, returning -EINTR. In the common scenario where a
+signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
+to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
+Rather than blocking the signal outside KVM_RUN, userspace can set up
+a signal handler that sets run->immediate_exit to a non-zero value.
+
+This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
+
+ __u8 padding1[6];
/* out */
__u32 exit_reason;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 21c493a9e5c9..c9a2103faeb9 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PSCI_0_2:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
return ret;
}
+ if (run->immediate_exit)
+ return -EINTR;
+
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 31ee5ee0010b..ed81e5ac1426 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
- int r = 0;
+ int r = -EINTR;
sigset_t sigsaved;
if (vcpu->sigset_active)
@@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
+ if (run->immediate_exit)
+ goto out;
+
lose_fpu(1);
local_irq_disable();
@@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
guest_exit_irqoff();
local_irq_enable();
+out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index fcb253ba51e5..2b38d824e9e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ONE_REG:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -1118,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
#endif
}
- r = kvmppc_vcpu_run(run, vcpu);
+ if (run->immediate_exit)
+ r = -EINTR;
+ else
+ r = kvmppc_vcpu_run(run, vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 502de74ea984..99e35fe0dea8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_S390_INJECT_IRQ:
case KVM_CAP_S390_USER_SIGP:
case KVM_CAP_S390_USER_STSI:
@@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int rc;
sigset_t sigsaved;
+ if (kvm_run->immediate_exit)
+ return -EINTR;
+
if (guestdbg_exit_pending(vcpu)) {
kvm_s390_prepare_debug_exit(vcpu);
return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 417502cf42b6..74ef58c8ff53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
#define KVM_REQ_TRIPLE_FAULT 10
#define KVM_REQ_MMU_SYNC 11
#define KVM_REQ_CLOCK_UPDATE 12
-#define KVM_REQ_DEACTIVATE_FPU 13
#define KVM_REQ_EVENT 14
#define KVM_REQ_APF_HALT 15
#define KVM_REQ_STEAL_UPDATE 16
@@ -936,8 +935,6 @@ struct kvm_x86_ops {
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 (*get_pkru)(struct kvm_vcpu *vcpu);
- void (*fpu_activate)(struct kvm_vcpu *vcpu);
- void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -969,7 +966,7 @@ struct kvm_x86_ops {
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
- void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+ int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c0e2036217ad..1d155cc56629 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- kvm_x86_ops->fpu_activate(vcpu);
-
/*
* The existing code assumes virtual address is 48-bit in the canonical
* address checks; exit if it is ever changed.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 33b799fd3a6e..bad6a25067bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -341,7 +341,7 @@ static int find_highest_vector(void *bitmap)
vec >= 0; vec -= APIC_VECTORS_PER_REG) {
reg = bitmap + REG_POS(vec);
if (*reg)
- return fls(*reg) - 1 + vec;
+ return __fls(*reg) + vec;
}
return -1;
@@ -361,27 +361,32 @@ static u8 count_vectors(void *bitmap)
return count;
}
-void __kvm_apic_update_irr(u32 *pir, void *regs)
+int __kvm_apic_update_irr(u32 *pir, void *regs)
{
- u32 i, pir_val;
+ u32 i, vec;
+ u32 pir_val, irr_val;
+ int max_irr = -1;
- for (i = 0; i <= 7; i++) {
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
pir_val = READ_ONCE(pir[i]);
+ irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
if (pir_val) {
- pir_val = xchg(&pir[i], 0);
- *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
+ irr_val |= xchg(&pir[i], 0);
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
}
+ if (irr_val)
+ max_irr = __fls(irr_val) + vec;
}
+
+ return max_irr;
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- __kvm_apic_update_irr(pir, apic->regs);
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return __kvm_apic_update_irr(pir, apic->regs);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@@ -401,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
if (!apic->irr_pending)
return -1;
- if (apic->vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
@@ -416,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
if (unlikely(vcpu->arch.apicv_active)) {
- /* try to update RVI */
+ /* need to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
apic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -508,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
*/
return apic_find_highest_irr(vcpu->arch.apic);
}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
@@ -524,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
- sizeof(val));
+ return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
}
static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
{
-
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
- sizeof(*val));
+ return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
}
static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -572,7 +575,11 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
- int highest_irr = apic_find_highest_irr(apic);
+ int highest_irr;
+ if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+ highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
return -1;
return highest_irr;
@@ -2204,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- if (kvm_x86_ops->apicv_post_state_restore)
- kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
kvm_x86_ops->hwapic_isr_update(vcpu,
@@ -2279,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32)))
+ if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2332,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
if (vapic_addr) {
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
&vcpu->arch.apic->vapic_cache,
vapic_addr, sizeof(u32)))
return -EINVAL;
@@ -2433,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
addr, sizeof(u8));
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 05abd837b78a..bcbe811f3b97 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -71,8 +71,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
-void __kvm_apic_update_irr(u32 *pir, void *regs);
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+int __kvm_apic_update_irr(u32 *pir, void *regs);
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d0414f054bdf..d1efe2c62b3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
* a particular vCPU.
*/
#define SVM_VM_DATA_HASH_BITS 8
-DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
-static spinlock_t svm_vm_data_hash_lock;
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
/* Note:
* This function is called from IOMMU driver to notify
@@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void)
} else {
pr_info("AVIC enabled\n");
- hash_init(svm_vm_data_hash);
- spin_lock_init(&svm_vm_data_hash_lock);
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
}
@@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- svm->vcpu.fpu_active = 1;
svm->vcpu.arch.hflags = 0;
set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
- if (!svm->vcpu.fpu_active)
- *hcr0 |= SVM_CR0_SELECTIVE_MASK;
- else
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
mark_dirty(svm->vmcb, VMCB_CR);
- if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
@@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!npt_enabled)
cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active)
- cr0 |= X86_CR0_TS;
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- clr_exception_intercept(svm, NM_VECTOR);
-
- svm->vcpu.fpu_active = 1;
- update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
- svm_fpu_activate(&svm->vcpu);
- return 1;
-}
-
static bool is_erratum_383(void)
{
int err, i;
@@ -2573,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
- case SVM_EXIT_EXCP_BASE + NM_VECTOR:
- nm_interception(svm);
- break;
default:
break;
}
@@ -4020,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
@@ -4359,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
-static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
{
kvm_lapic_set_irr(vec, vcpu->arch.apic);
@@ -5079,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
return true;
}
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- set_exception_intercept(svm, NM_VECTOR);
- update_cr0_intercept(svm);
-}
-
#define PRE_EX(exit) { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
@@ -5347,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_pkru = svm_get_pkru,
- .fpu_activate = svm_fpu_activate,
- .fpu_deactivate = svm_fpu_deactivate,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
@@ -5373,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_enable_apicv = svm_get_enable_apicv,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
- .sync_pir_to_irr = svm_sync_pir_to_irr,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
.apicv_post_state_restore = avic_post_state_restore,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7c3e42623090..9856b73a21ad 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
- if (vcpu->fpu_active)
- eb &= ~(1u << NM_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
}
}
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
- ulong cr0;
-
- if (vcpu->fpu_active)
- return;
- vcpu->fpu_active = 1;
- cr0 = vmcs_readl(GUEST_CR0);
- cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
- cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
- vmcs_writel(GUEST_CR0, cr0);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
- if (is_guest_mode(vcpu))
- vcpu->arch.cr0_guest_owned_bits &=
- ~get_vmcs12(vcpu)->cr0_guest_host_mask;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-}
-
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
/*
@@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- /* Note that there is no vcpu->fpu_active = 0 here. The caller must
- * set this *before* calling this function.
- */
- vmx_decache_cr0_guest_bits(vcpu);
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = 0;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- if (is_guest_mode(vcpu)) {
- /*
- * L1's specified read shadow might not contain the TS bit,
- * so now that we turned on shadowing of this bit, we need to
- * set this bit of the shadow. Like in nested_vmx_run we need
- * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
- * up-to-date here because we just decached cr0.TS (and we'll
- * only update vmcs12->guest_cr0 on nested exit).
- */
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
- (vcpu->arch.cr0 & X86_CR0_TS);
- vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
- } else
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-}
-
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
@@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
- if (!vcpu->fpu_active)
- hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
-
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
@@ -5051,26 +5000,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
- r = pi_test_and_set_on(&vmx->pi_desc);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
- kvm_vcpu_kick(vcpu);
-}
-
-static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!pi_test_on(&vmx->pi_desc))
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(&vmx->pi_desc))
return;
- pi_clear_on(&vmx->pi_desc);
- /*
- * IOMMU can write to PIR.ON, so the barrier matters even on UP.
- * But on x86 this is just a compiler barrier anyway.
- */
- smp_mb__after_atomic();
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+ kvm_vcpu_kick(vcpu);
}
/*
@@ -5335,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+ vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
set_cr4_guest_host_mask(vmx);
if (vmx_xsaves_supported())
@@ -5439,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
- vmx_fpu_activate(vcpu);
+
update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
@@ -5473,26 +5410,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
if (!cpu_has_virtual_nmis() ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -5718,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
- if (is_no_device(intr_info)) {
- vmx_fpu_activate(vcpu);
- return 1;
- }
-
if (is_invalid_opcode(intr_info)) {
if (is_guest_mode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5912,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
-/* called to set cr0 as appropriate for clts instruction exit. */
-static void handle_clts(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu)) {
- /*
- * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
- * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
- * just pretend it's off (also in arch.cr0 for fpu_activate).
- */
- vmcs_writel(CR0_READ_SHADOW,
- vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- } else
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-}
-
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -5973,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- handle_clts(vcpu);
+ WARN_ONCE(1, "Guest should always own CR0.TS");
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- vmx_fpu_activate(vcpu);
return kvm_skip_emulated_instruction(vcpu);
case 1: /*mov from cr*/
switch (cr) {
@@ -6151,12 +6061,8 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending irq */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -6382,6 +6288,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
EPT_VIOLATION_EXECUTABLE))
? PFERR_PRESENT_MASK : 0;
+ vcpu->arch.gpa_available = true;
vcpu->arch.exit_qualification = exit_qualification;
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6399,6 +6306,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
}
ret = handle_mmio_page_fault(vcpu, gpa, true);
+ vcpu->arch.gpa_available = true;
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -6420,12 +6328,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending NMI */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -6663,8 +6567,10 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
- if (!cpu_has_vmx_apicv())
+ if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
+ kvm_x86_ops->sync_pir_to_irr = NULL;
+ }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
@@ -7134,6 +7040,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
return 0;
}
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs *shadow_vmcs;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ vmx->nested.msr_bitmap =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx->nested.msr_bitmap)
+ goto out_msr_bitmap;
+ }
+
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ if (enable_shadow_vmcs) {
+ shadow_vmcs = alloc_vmcs();
+ if (!shadow_vmcs)
+ goto out_shadow_vmcs;
+ /* mark vmcs as shadow */
+ shadow_vmcs->revision_id |= (1u << 31);
+ /* init shadow vmcs */
+ vmcs_clear(shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+ }
+
+ INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+ vmx->nested.vmcs02_num = 0;
+
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+ vmx->nested.vmxon = true;
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+ return -ENOMEM;
+}
+
/*
* Emulate the VMXON instruction.
* Currently, we just remember that VMX is active, and do not save or even
@@ -7144,9 +7097,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*/
static int handle_vmon(struct kvm_vcpu *vcpu)
{
+ int ret;
struct kvm_segment cs;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs *shadow_vmcs;
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7186,49 +7139,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
return 1;
-
- if (cpu_has_vmx_msr_bitmap()) {
- vmx->nested.msr_bitmap =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx->nested.msr_bitmap)
- goto out_msr_bitmap;
- }
-
- vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
- if (!vmx->nested.cached_vmcs12)
- goto out_cached_vmcs12;
-
- if (enable_shadow_vmcs) {
- shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
- goto out_shadow_vmcs;
- /* mark vmcs as shadow */
- shadow_vmcs->revision_id |= (1u << 31);
- /* init shadow vmcs */
- vmcs_clear(shadow_vmcs);
- vmx->vmcs01.shadow_vmcs = shadow_vmcs;
- }
-
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
- hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED);
- vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
-
- vmx->nested.vmxon = true;
+
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
-
-out_shadow_vmcs:
- kfree(vmx->nested.cached_vmcs12);
-
-out_cached_vmcs12:
- free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
- return -ENOMEM;
}
/*
@@ -7677,6 +7594,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.sync_shadow_vmcs = true;
+ }
+}
+
/* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
@@ -7707,7 +7636,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
/*
@@ -7716,14 +7644,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
*/
memcpy(vmx->nested.cached_vmcs12,
vmx->nested.current_vmcs12, VMCS12_SIZE);
-
- if (enable_shadow_vmcs) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER,
- __pa(vmx->vmcs01.shadow_vmcs));
- vmx->nested.sync_shadow_vmcs = true;
- }
+ set_current_vmptr(vmx, vmptr);
}
nested_vmx_succeed(vcpu);
@@ -8517,6 +8438,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 vectoring_info = vmx->idt_vectoring_info;
trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+ vcpu->arch.gpa_available = false;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -8735,6 +8657,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
}
}
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+
+ WARN_ON(!vcpu->arch.apicv_active);
+ if (pi_test_on(&vmx->pi_desc)) {
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ }
+ vmx_hwapic_irr_update(vcpu, max_irr);
+ return max_irr;
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -8746,6 +8689,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
+static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ pi_clear_on(&vmx->pi_desc);
+ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -9591,17 +9542,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
kvm_inject_page_fault(vcpu, fault);
}
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 hpa;
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
- vmcs12->apic_access_addr >> maxphyaddr)
- return false;
-
/*
* Translate L1 physical address to host physical
* address for vmcs02. Keep the page pinned, so this
@@ -9612,59 +9562,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
nested_release_page(vmx->nested.apic_access_page);
vmx->nested.apic_access_page =
nested_get_page(vcpu, vmcs12->apic_access_addr);
+ /*
+ * If translation failed, no matter: This feature asks
+ * to exit when accessing the given address, and if it
+ * can never be accessed, this feature won't do
+ * anything anyway.
+ */
+ if (vmx->nested.apic_access_page) {
+ hpa = page_to_phys(vmx->nested.apic_access_page);
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ } else {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ }
+ } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ kvm_vcpu_reload_apic_access_page(vcpu);
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
- vmcs12->virtual_apic_page_addr >> maxphyaddr)
- return false;
-
if (vmx->nested.virtual_apic_page) /* shouldn't happen */
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page =
nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
/*
- * Failing the vm entry is _not_ what the processor does
- * but it's basically the only possibility we have.
- * We could still enter the guest if CR8 load exits are
- * enabled, CR8 store exits are enabled, and virtualize APIC
- * access is disabled; in this case the processor would never
- * use the TPR shadow and we could simply clear the bit from
- * the execution control. But such a configuration is useless,
- * so let's keep the code simple.
+ * If translation failed, VM entry will fail because
+ * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+ * Failing the vm entry is _not_ what the processor
+ * does but it's basically the only possibility we
+ * have. We could still enter the guest if CR8 load
+ * exits are enabled, CR8 store exits are enabled, and
+ * virtualize APIC access is disabled; in this case
+ * the processor would never use the TPR shadow and we
+ * could simply clear the bit from the execution
+ * control. But such a configuration is useless, so
+ * let's keep the code simple.
*/
- if (!vmx->nested.virtual_apic_page)
- return false;
+ if (vmx->nested.virtual_apic_page) {
+ hpa = page_to_phys(vmx->nested.virtual_apic_page);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ }
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
- vmcs12->posted_intr_desc_addr >> maxphyaddr)
- return false;
-
if (vmx->nested.pi_desc_page) { /* shouldn't happen */
kunmap(vmx->nested.pi_desc_page);
nested_release_page(vmx->nested.pi_desc_page);
}
vmx->nested.pi_desc_page =
nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
- if (!vmx->nested.pi_desc_page)
- return false;
-
vmx->nested.pi_desc =
(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
if (!vmx->nested.pi_desc) {
nested_release_page_clean(vmx->nested.pi_desc_page);
- return false;
+ return;
}
vmx->nested.pi_desc =
(struct pi_desc *)((void *)vmx->nested.pi_desc +
(unsigned long)(vmcs12->posted_intr_desc_addr &
(PAGE_SIZE - 1)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
}
-
- return true;
+ if (cpu_has_vmx_msr_bitmap() &&
+ nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
+ nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+ ;
+ else
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
}
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
@@ -9980,7 +9951,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
* is assigned to entry_failure_code on failure.
*/
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
- unsigned long *entry_failure_code)
+ u32 *entry_failure_code)
{
if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
if (!nested_cr3_valid(vcpu, cr3)) {
@@ -10020,7 +9991,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- unsigned long *entry_failure_code)
+ bool from_vmentry, u32 *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;
@@ -10063,21 +10034,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
}
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- vmcs12->vm_entry_intr_info_field);
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs12->vm_entry_exception_error_code);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs12->vm_entry_instruction_len);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs12->guest_interruptibility_info);
+ if (from_vmentry) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10106,12 +10082,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR,
- page_to_phys(vmx->nested.pi_desc_page) +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
- } else
+ } else {
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
@@ -10156,26 +10129,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
- if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
- if (!vmx->nested.apic_access_page)
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- else
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->nested.apic_access_page));
- } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- exec_control |=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- kvm_vcpu_reload_apic_access_page(vcpu);
- }
-
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
vmcs12->eoi_exit_bitmap0);
@@ -10190,6 +10143,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
+
+ /*
+ * Write an illegal value to APIC_ACCESS_ADDR. Later,
+ * nested_get_vmcs12_pages will either fix it up or
+ * remove the VM execution control.
+ */
+ if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -10226,19 +10188,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+ * nested_get_vmcs12_pages can't fix it up, the illegal value
+ * will result in a VM entry failure.
+ */
if (exec_control & CPU_BASED_TPR_SHADOW) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->nested.virtual_apic_page));
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}
- if (cpu_has_vmx_msr_bitmap() &&
- exec_control & CPU_BASED_USE_MSR_BITMAPS &&
- nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
- ; /* MSR_BITMAP will be set by following vmx_set_efer. */
- else
- exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
-
/*
* Merging of IO bitmap not currently supported.
* Rather, exit every time.
@@ -10270,16 +10229,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
- } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
+ }
set_cr4_guest_host_mask(vmx);
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+ if (from_vmentry &&
+ vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10318,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
- * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+ * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+ * bits which we consider mandatory enabled.
* The CR0_READ_SHADOW is what L2 should have expected to read given
* the specifications by L1; It's not enough to take
* vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
@@ -10331,7 +10292,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
vcpu->arch.efer = vmcs12->guest_ia32_efer;
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10365,73 +10327,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
-/*
- * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
- * for running an L2 nested guest.
- */
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
- struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
- struct loaded_vmcs *vmcs02;
- bool ia32e;
- u32 msr_entry_idx;
- unsigned long exit_qualification;
-
- if (!nested_vmx_check_permission(vcpu))
- return 1;
-
- if (!nested_vmx_check_vmcs12(vcpu))
- goto out;
-
- vmcs12 = get_vmcs12(vcpu);
-
- if (enable_shadow_vmcs)
- copy_shadow_to_vmcs12(vmx);
-
- /*
- * The nested entry process starts with enforcing various prerequisites
- * on vmcs12 as required by the Intel SDM, and act appropriately when
- * they fail: As the SDM explains, some conditions should cause the
- * instruction to fail, while others will cause the instruction to seem
- * to succeed, but return an EXIT_REASON_INVALID_STATE.
- * To speed up the normal (success) code path, we should avoid checking
- * for misconfigurations which will anyway be caught by the processor
- * when using the merged vmcs02.
- */
- if (vmcs12->launch_state == launch) {
- nested_vmx_failValid(vcpu,
- launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
- : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- goto out;
- }
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
-
- if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
vmx->nested.nested_vmx_procbased_ctls_low,
@@ -10448,28 +10359,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!vmx_control_verify(vmcs12->vm_entry_controls,
vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high))
- {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
- !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
- nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
- goto out;
- }
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+ return 0;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *exit_qual)
+{
+ bool ia32e;
+
+ *exit_qual = ENTRY_FAIL_DEFAULT;
if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
- !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
return 1;
- }
- if (vmcs12->vmcs_link_pointer != -1ull) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
return 1;
}
@@ -10482,16 +10395,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
* CR0.PG) is 1.
*/
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
((vmcs12->guest_cr0 & X86_CR0_PG) &&
- ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
return 1;
- }
}
/*
@@ -10505,28 +10416,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
return 1;
- }
}
- /*
- * We're finally done with prerequisite checking, and can start with
- * the nested entry.
- */
+ return 0;
+}
+
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct loaded_vmcs *vmcs02;
+ int cpu;
+ u32 msr_entry_idx;
+ u32 exit_qual;
vmcs02 = nested_get_current_vmcs02(vmx);
if (!vmcs02)
return -ENOMEM;
- /*
- * After this point, the trap flag no longer triggers a singlestep trap
- * on the vm entry instructions. Don't call
- * kvm_skip_emulated_instruction.
- */
- skip_emulated_instruction(vcpu);
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10541,14 +10450,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, exit_qualification);
+ EXIT_REASON_INVALID_STATE, exit_qual);
return 1;
}
+ nested_get_vmcs12_pages(vcpu, vmcs12);
+
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
@@ -10562,17 +10473,90 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmcs12->launch_state = 1;
- if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
- return kvm_vcpu_halt(vcpu);
-
- vmx->nested.nested_run_pending = 1;
-
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
+ return 0;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_qual;
+ int ret;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and act appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (vmcs12->launch_state == launch) {
+ nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ goto out;
+ }
+
+ ret = check_vmentry_prereqs(vcpu, vmcs12);
+ if (ret) {
+ nested_vmx_failValid(vcpu, ret);
+ goto out;
+ }
+
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
+ * This is not 100% correct; for performance reasons, we delegate most
+ * of the checks on host state to the processor. If those fail,
+ * the singlestep trap is missed.
+ */
+ skip_emulated_instruction(vcpu);
+
+ ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qual);
+ return 1;
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ ret = enter_vmx_non_root_mode(vcpu, true);
+ if (ret)
+ return ret;
+
+ if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+ return kvm_vcpu_halt(vcpu);
+
+ vmx->nested.nested_run_pending = 1;
+
return 1;
out:
@@ -10713,21 +10697,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
}
/*
- * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
- * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
- * and this function updates it to reflect the changes to the guest state while
- * L2 was running (and perhaps made some exits which were handled directly by L0
- * without going back to L1), and to reflect the exit reason.
- * Note that we do not have to copy here all VMCS fields, just those that
- * could have changed by the L2 guest or the exit - i.e., the guest-state and
- * exit-information fields only. Other fields are modified by L1 with VMWRITE,
- * which already writes to vmcs12 directly.
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
*/
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 exit_reason, u32 exit_intr_info,
- unsigned long exit_qualification)
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
- /* update guest state fields: */
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -10833,6 +10809,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
if (nested_cpu_has_xsaves(vmcs12))
vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ /* update guest state fields: */
+ sync_vmcs12(vcpu, vmcs12);
/* update exit information fields: */
@@ -10883,7 +10878,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
- unsigned long entry_failure_code;
+ u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10898,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
/*
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
- * actually changed, because it depends on the current state of
- * fpu_active (which may have changed).
- * Note that vmx_set_cr0 refers to efer set above.
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it);
*/
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
vmx_set_cr0(vcpu, vmcs12->host_cr0);
- /*
- * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
- * to apply the same changes to L1's vmcs. We just set cr0 correctly,
- * but we also need to update cr0_guest_host_mask and exception_bitmap.
- */
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- /*
- * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
- * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
- */
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
kvm_set_cr4(vcpu, vmcs12->host_cr4);
@@ -11544,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_pkru = vmx_get_pkru,
- .fpu_activate = vmx_fpu_activate,
- .fpu_deactivate = vmx_fpu_deactivate,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
@@ -11571,6 +11554,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_enable_apicv = vmx_get_enable_apicv,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f64e5d0ae53..c48404017e4f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
&guest_hv_clock, sizeof(guest_hv_clock))))
return;
@@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
sizeof(u32)))
return 1;
@@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
@@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
@@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
vcpu->arch.st.steal.version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
@@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & 1))
break;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled = false;
@@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
data & KVM_STEAL_VALID_BITS,
sizeof(struct kvm_steal_time)))
return 1;
@@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2875,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.preempted = 1;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal.preempted,
offsetof(struct kvm_steal_time, preempted),
sizeof(vcpu->arch.st.steal.preempted));
@@ -2909,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -6659,7 +6660,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -6750,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
- vcpu->fpu_active = 0;
- kvm_x86_ops->fpu_deactivate(vcpu);
- }
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
vcpu->arch.apf.halted = true;
@@ -6813,20 +6810,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_hv_process_stimers(vcpu);
}
- /*
- * KVM_REQ_EVENT is not set when posted interrupts are set by
- * VT-d hardware, so we have to update RVI unconditionally.
- */
- if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (vcpu->arch.apicv_active)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
- }
-
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
@@ -6869,22 +6852,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
- if (vcpu->fpu_active)
- kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
+ * IPI are then delayed after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
+ local_irq_disable();
vcpu->mode = IN_GUEST_MODE;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
/*
- * We should set ->mode before check ->requests,
- * Please see the comment in kvm_make_all_cpus_request.
- * This also orders the write to mode from any reads
- * to the page tables done while the VCPU is running.
- * Please see the comment in kvm_flush_remote_tlbs.
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_make_all_cpus_request.
+ *
+ * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
*/
smp_mb__after_srcu_read_unlock();
- local_irq_disable();
+ /*
+ * This handles the case where a posted interrupt was
+ * notified with kvm_vcpu_kick.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ }
if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) {
@@ -7023,6 +7024,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu, false);
+
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
@@ -7194,7 +7198,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- r = vcpu_run(vcpu);
+ if (kvm_run->immediate_exit)
+ r = -EINTR;
+ else
+ r = vcpu_run(vcpu);
out:
post_kvm_run_save(vcpu);
@@ -8389,9 +8396,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu, false);
-
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
@@ -8528,9 +8532,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
+ return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
+ sizeof(val));
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
index 0a54e8326a90..09b4df74291e 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/ptp_kvm.c
@@ -176,12 +176,19 @@ static void __exit ptp_kvm_exit(void)
static int __init ptp_kvm_init(void)
{
+ long ret;
+
clock_pair_gpa = slow_virt_to_phys(&clock_pair);
hv_clock = pvclock_pvti_cpu0_va();
if (!hv_clock)
return -ENODEV;
+ ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+ KVM_CLOCK_PAIRING_WALLCLOCK);
+ if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+ return -ENODEV;
+
kvm_ptp_clock.caps = ptp_kvm_caps;
kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cda457bcedc1..8d69d5150748 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -221,7 +221,6 @@ struct kvm_vcpu {
struct mutex mutex;
struct kvm_run *run;
- int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
struct swait_queue_head wq;
struct pid *pid;
@@ -641,18 +640,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
unsigned long len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len);
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
unsigned long len);
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len);
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, int offset, unsigned long len);
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- gpa_t gpa, unsigned long len);
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+ void *data, int offset, unsigned long len);
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa, unsigned long len);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7964b970b9ad..f51d5082a377 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
struct kvm_run {
/* in */
__u8 request_interrupt_window;
- __u8 padding1[7];
+ __u8 immediate_exit;
+ __u8 padding1[6];
/* out */
__u32 exit_reason;
@@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_SPAPR_RESIZE_HPT 133
#define KVM_CAP_PPC_MMU_RADIX 134
#define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482612b4e496..cc4d6e0dd2a2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -506,11 +506,6 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
if (!slots)
return NULL;
- /*
- * Init kvm generation close to the maximum to easily test the
- * code of handling generation number wrap-around.
- */
- slots->generation = -150;
for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
slots->id_to_index[i] = slots->memslots[i].id = i;
@@ -641,9 +636,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
r = -ENOMEM;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- kvm->memslots[i] = kvm_alloc_memslots();
- if (!kvm->memslots[i])
+ struct kvm_memslots *slots = kvm_alloc_memslots();
+ if (!slots)
goto out_err_no_srcu;
+ /*
+ * Generations must be different for each address space.
+ * Init kvm generation close to the maximum to easily test the
+ * code of handling generation number wrap-around.
+ */
+ slots->generation = i * 2 - 150;
+ rcu_assign_pointer(kvm->memslots[i], slots);
}
if (init_srcu_struct(&kvm->srcu))
@@ -870,8 +872,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
* Increment the new memslot generation a second time. This prevents
* vm exits that race with memslot updates from caching a memslot
* generation that will (potentially) be valid forever.
+ *
+ * Generations must be unique even across address spaces. We do not need
+ * a global counter for that, instead the generation space is evenly split
+ * across address spaces. For example, with two address spaces, address
+ * space 0 will use generations 0, 4, 8, ... while * address space 1 will
+ * use generations 2, 6, 10, 14, ...
*/
- slots->generation++;
+ slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
kvm_arch_memslots_updated(kvm, slots);
@@ -1094,37 +1102,31 @@ int kvm_get_dirty_log(struct kvm *kvm,
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- int r, i, as_id, id;
+ int i, as_id, id;
unsigned long n;
unsigned long any = 0;
- r = -EINVAL;
as_id = log->slot >> 16;
id = (u16)log->slot;
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
- goto out;
+ return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
memslot = id_to_memslot(slots, id);
- r = -ENOENT;
if (!memslot->dirty_bitmap)
- goto out;
+ return -ENOENT;
n = kvm_dirty_bitmap_bytes(memslot);
for (i = 0; !any && i < n/sizeof(long); ++i)
any = memslot->dirty_bitmap[i];
- r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
- goto out;
+ return -EFAULT;
if (any)
*is_dirty = 1;
-
- r = 0;
-out:
- return r;
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
@@ -1156,24 +1158,22 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- int r, i, as_id, id;
+ int i, as_id, id;
unsigned long n;
unsigned long *dirty_bitmap;
unsigned long *dirty_bitmap_buffer;
- r = -EINVAL;
as_id = log->slot >> 16;
id = (u16)log->slot;
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
- goto out;
+ return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
memslot = id_to_memslot(slots, id);
dirty_bitmap = memslot->dirty_bitmap;
- r = -ENOENT;
if (!dirty_bitmap)
- goto out;
+ return -ENOENT;
n = kvm_dirty_bitmap_bytes(memslot);
@@ -1202,14 +1202,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
}
spin_unlock(&kvm->mmu_lock);
-
- r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
- goto out;
-
- r = 0;
-out:
- return r;
+ return -EFAULT;
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
#endif
@@ -1937,10 +1932,10 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- gpa_t gpa, unsigned long len)
+static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
+ struct gfn_to_hva_cache *ghc,
+ gpa_t gpa, unsigned long len)
{
- struct kvm_memslots *slots = kvm_memslots(kvm);
int offset = offset_in_page(gpa);
gfn_t start_gfn = gpa >> PAGE_SHIFT;
gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
@@ -1950,7 +1945,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
ghc->gpa = gpa;
ghc->generation = slots->generation;
ghc->len = len;
- ghc->memslot = gfn_to_memslot(kvm, start_gfn);
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
ghc->hva += offset;
@@ -1960,7 +1955,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
* verify that the entire region is valid here.
*/
while (start_gfn <= end_gfn) {
- ghc->memslot = gfn_to_memslot(kvm, start_gfn);
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
&nr_pages_avail);
if (kvm_is_error_hva(ghc->hva))
@@ -1972,22 +1967,29 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, int offset, unsigned long len)
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ gpa_t gpa, unsigned long len)
{
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+ return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init);
+
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, int offset, unsigned long len)
+{
+ struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
int r;
gpa_t gpa = ghc->gpa + offset;
BUG_ON(len + offset > ghc->len);
if (slots->generation != ghc->generation)
- kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
+ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
if (unlikely(!ghc->memslot))
- return kvm_write_guest(kvm, gpa, data, len);
+ return kvm_vcpu_write_guest(vcpu, gpa, data, len);
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
@@ -1999,28 +2001,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached);
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len)
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len)
{
- return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+ return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len);
}
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached);
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, unsigned long len)
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len)
{
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
int r;
BUG_ON(len > ghc->len);
if (slots->generation != ghc->generation)
- kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
+ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
if (unlikely(!ghc->memslot))
- return kvm_read_guest(kvm, ghc->gpa, data, len);
+ return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len);
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
@@ -2031,7 +2033,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
@@ -3133,10 +3135,9 @@ static long kvm_vm_compat_ioctl(struct file *filp,
struct compat_kvm_dirty_log compat_log;
struct kvm_dirty_log log;
- r = -EFAULT;
if (copy_from_user(&compat_log, (void __user *)arg,
sizeof(compat_log)))
- goto out;
+ return -EFAULT;
log.slot = compat_log.slot;
log.padding1 = compat_log.padding1;
log.padding2 = compat_log.padding2;
@@ -3148,8 +3149,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
default:
r = kvm_vm_ioctl(filp, ioctl, arg);
}
-
-out:
return r;
}
#endif