diff options
57 files changed, 2299 insertions, 872 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 22a4b687ea5b..73588fcaff8c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1553,6 +1553,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nosid disable Source ID checking no_x2apic_optout BIOS x2APIC opt-out request will be ignored + nopost disable Interrupt Posting iomem= Disable strict checking of access to MMIO memory strict regions from userspace. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d9ecceea5a02..34cc068e81ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -401,10 +401,9 @@ Capability: basic Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error +Returns: 0 on success, negative on failure. -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. +Queues a hardware interrupt vector to be injected. /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -414,7 +413,14 @@ struct kvm_interrupt { X86: -Note 'irq' is an interrupt vector, not an interrupt pin or line. +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL the the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. PPC: @@ -1598,7 +1604,7 @@ provided event instead of triggering an exit. struct kvm_ioeventfd { __u64 datamatch; __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 1, 2, 4, or 8 bytes */ + __u32 len; /* 0, 1, 2, 4, or 8 bytes */ __s32 fd; __u32 flags; __u8 pad[36]; @@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd. For virtio-ccw devices, addr contains the subchannel id and datamatch the virtqueue index. +With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and +the kernel will ignore the length of guest write and may get a faster vmexit. +The speedup may only apply to specific architectures, but the ioeventfd will +work anyway. 4.60 KVM_DIRTY_TLB @@ -3309,6 +3319,18 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + /* Fix the size of the union. */ char padding[256]; }; @@ -3627,6 +3649,26 @@ struct { KVM handlers should exit to userspace with rc = -EREMOTE. +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: args[0] - number of routes reserved for userspace IOAPICs +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + 8. Other capabilities. ---------------------- diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index d68af4dc3006..19f94a6b9bb0 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g. MMIO/PIO address->device structure mapping (kvm->buses). The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. diff --git a/MAINTAINERS b/MAINTAINERS index 9f6685f6c5a9..3b738cb8bdeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5957,7 +5957,7 @@ F: virt/kvm/ KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V M: Joerg Roedel <[email protected]> -W: http://kvm.qumranet.com +W: http://www.linux-kvm.org/ S: Maintained F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c @@ -5965,7 +5965,7 @@ F: arch/x86/kvm/svm.c KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC M: Alexander Graf <[email protected]> -W: http://kvm.qumranet.com +W: http://www.linux-kvm.org/ T: git git://github.com/agraf/linux-2.6.git S: Supported F: arch/powerpc/include/asm/kvm* @@ -11177,6 +11177,13 @@ L: [email protected] S: Maintained F: drivers/net/ethernet/via/via-velocity.* +VIRT LIB +M: Alex Williamson <[email protected]> +M: Paolo Bonzini <[email protected]> +S: Supported +F: virt/lib/ + VIVID VIRTUAL VIDEO DRIVER M: Hans Verkuil <[email protected]> @@ -550,6 +550,7 @@ drivers-y := drivers/ sound/ firmware/ net-y := net/ libs-y := lib/ core-y := usr/ +virt-y := virt/ endif # KBUILD_EXTMOD ifeq ($(dot-config),1) @@ -882,10 +883,10 @@ core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ - $(net-y) $(net-m) $(libs-y) $(libs-m))) + $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y))) vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \ - $(init-) $(core-) $(drivers-) $(net-) $(libs-)))) + $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-)))) init-y := $(patsubst %/, %/built-in.o, $(init-y)) core-y := $(patsubst %/, %/built-in.o, $(core-y)) @@ -894,14 +895,15 @@ net-y := $(patsubst %/, %/built-in.o, $(net-y)) libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y)) libs-y := $(libs-y1) $(libs-y2) +virt-y := $(patsubst %/, %/built-in.o, $(virt-y)) # Externally visible symbols (used by link-vmlinux.sh) export KBUILD_VMLINUX_INIT := $(head-y) $(init-y) -export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) +export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux # used by scripts/pacmage/Makefile -export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt) +export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools) vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7365e8a46032..b4a5aa110cec 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } -static const intercept_handler_t intercept_funcs[] = { - [0x00 >> 2] = handle_noop, - [0x04 >> 2] = handle_instruction, - [0x08 >> 2] = handle_prog, - [0x10 >> 2] = handle_noop, - [0x14 >> 2] = handle_external_interrupt, - [0x18 >> 2] = handle_noop, - [0x1C >> 2] = kvm_s390_handle_wait, - [0x20 >> 2] = handle_validity, - [0x28 >> 2] = handle_stop, - [0x38 >> 2] = handle_partial_execution, -}; - int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { - intercept_handler_t func; - u8 code = vcpu->arch.sie_block->icptcode; - - if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) + switch (vcpu->arch.sie_block->icptcode) { + case 0x00: + case 0x10: + case 0x18: + return handle_noop(vcpu); + case 0x04: + return handle_instruction(vcpu); + case 0x08: + return handle_prog(vcpu); + case 0x14: + return handle_external_interrupt(vcpu); + case 0x1c: + return kvm_s390_handle_wait(vcpu); + case 0x20: + return handle_validity(vcpu); + case 0x28: + return handle_stop(vcpu); + case 0x38: + return handle_partial_execution(vcpu); + default: return -EOPNOTSUPP; - func = intercept_funcs[code >> 2]; - if (func) - return func(vcpu); - return -EOPNOTSUPP; + } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5c2c169395c3..373e32346d68 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu) static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) { - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT)) - return 0; - return 1; + return psw_extint_disabled(vcpu) && + psw_ioint_disabled(vcpu) && + psw_mchk_disabled(vcpu); } static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) @@ -71,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { - preempt_disable(); - if (!(vcpu->arch.sie_block->ckc < - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { - preempt_enable(); + if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm)) return 0; - } - preempt_enable(); return ckc_interrupts_enabled(vcpu); } @@ -109,14 +102,10 @@ static inline u8 int_word_to_isc(u32 int_word) return (int_word & 0x38000000) >> 27; } -static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu) +static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs; -} - -static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.local_int.pending_irqs; + return vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; } static unsigned long disable_iscs(struct kvm_vcpu *vcpu, @@ -135,8 +124,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) { unsigned long active_mask; - active_mask = pending_local_irqs(vcpu); - active_mask |= pending_floating_irqs(vcpu); + active_mask = pending_irqs(vcpu); if (!active_mask) return 0; @@ -204,7 +192,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { - if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK)) return; else if (psw_ioint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_IO_INT); @@ -214,7 +202,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); @@ -224,7 +212,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; @@ -815,23 +803,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) { - int rc; + if (deliverable_irqs(vcpu)) + return 1; - rc = !!deliverable_irqs(vcpu); - - if (!rc && kvm_cpu_has_pending_timer(vcpu)) - rc = 1; + if (kvm_cpu_has_pending_timer(vcpu)) + return 1; /* external call pending and deliverable */ - if (!rc && kvm_s390_ext_call_pending(vcpu) && + if (kvm_s390_ext_call_pending(vcpu) && !psw_extint_disabled(vcpu) && (vcpu->arch.sie_block->gcr[0] & 0x2000ul)) - rc = 1; - - if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) - rc = 1; + return 1; - return rc; + if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) + return 1; + return 0; } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -846,7 +832,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) vcpu->stat.exit_wait_state++; /* fast path */ - if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu)) + if (kvm_arch_vcpu_runnable(vcpu)) return 0; if (psw_interrupts_disabled(vcpu)) { @@ -860,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) goto no_timer; } - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* underflow */ @@ -901,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) u64 now, sltime; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* @@ -981,39 +963,30 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, irq->u.pgm.code, 0); - li->irq.pgm = irq->u.pgm; + if (irq->u.pgm.code == PGM_PER) { + li->irq.pgm.code |= PGM_PER; + /* only modify PER related information */ + li->irq.pgm.per_address = irq->u.pgm.per_address; + li->irq.pgm.per_code = irq->u.pgm.per_code; + li->irq.pgm.per_atmid = irq->u.pgm.per_atmid; + li->irq.pgm.per_access_id = irq->u.pgm.per_access_id; + } else if (!(irq->u.pgm.code & PGM_PER)) { + li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) | + irq->u.pgm.code; + /* only modify non-PER information */ + li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code; + li->irq.pgm.mon_code = irq->u.pgm.mon_code; + li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code; + li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr; + li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id; + li->irq.pgm.op_access_id = irq->u.pgm.op_access_id; + } else { + li->irq.pgm = irq->u.pgm; + } set_bit(IRQ_PEND_PROG, &li->pending_irqs); return 0; } -int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - - spin_lock(&li->lock); - irq.u.pgm.code = code; - __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return 0; -} - -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - int rc; - - spin_lock(&li->lock); - irq.u.pgm = *pgm_info; - rc = __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return rc; -} - static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -1390,12 +1363,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { - struct kvm_s390_float_interrupt *fi; u64 type = READ_ONCE(inti->type); int rc; - fi = &kvm->arch.float_int; - switch (type) { case KVM_S390_MCHK: rc = __inject_float_mchk(kvm, inti); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0a67c40eece9..07a6aa896c12 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -514,35 +514,20 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (gtod_high != 0) return -EINVAL; - VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high); return 0; } static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - struct kvm_vcpu *cur_vcpu; - unsigned int vcpu_idx; - u64 host_tod, gtod; - int r; + u64 gtod; if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - r = store_tod_clock(&host_tod); - if (r) - return r; - - mutex_lock(&kvm->lock); - preempt_disable(); - kvm->arch.epoch = gtod - host_tod; - kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) - cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; - kvm_s390_vcpu_unblock_all(kvm); - preempt_enable(); - mutex_unlock(&kvm->lock); - VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); + kvm_s390_set_tod_clock(kvm, gtod); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod); return 0; } @@ -574,26 +559,19 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_to_user((void __user *)attr->addr, >od_high, sizeof(gtod_high))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high); return 0; } static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 host_tod, gtod; - int r; + u64 gtod; - r = store_tod_clock(&host_tod); - if (r) - return r; - - preempt_disable(); - gtod = host_tod + kvm->arch.epoch; - preempt_enable(); + gtod = kvm_s390_get_tod_clock_fast(kvm); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod); return 0; } @@ -1120,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); - sca_offset = (sca_offset + 16) & 0x7f0; + sca_offset += 16; + if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE) + sca_offset = 0; kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset); spin_unlock(&kvm_lock); @@ -1916,6 +1896,22 @@ retry: return 0; } +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) +{ + struct kvm_vcpu *vcpu; + int i; + + mutex_lock(&kvm->lock); + preempt_disable(); + kvm->arch.epoch = tod - get_tod_clock(); + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.sie_block->epoch = kvm->arch.epoch; + kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); + mutex_unlock(&kvm->lock); +} + /** * kvm_arch_fault_in_page - fault-in guest page if necessary * @vcpu: The corresponding virtual cpu diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c446aabf60d3..1e70e00d3c5e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); @@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int); int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq); -int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, + struct kvm_s390_pgm_info *pgm_info) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm = *pgm_info, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} +static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = code, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid); int kvm_s390_reinject_io_int(struct kvm *kvm, @@ -212,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, @@ -231,9 +251,6 @@ extern unsigned long kvm_s390_fac_list_mask[]; /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); -/* implemented in interrupt.c */ -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { @@ -254,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) kvm_s390_vcpu_unblock(vcpu); } +static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm) +{ + u64 rc; + + preempt_disable(); + rc = get_tod_clock_fast() + kvm->arch.epoch; + preempt_enable(); + return rc; +} + /** * kvm_s390_inject_prog_cond - conditionally inject a program check * @vcpu: virtual cpu diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4d21dc4d1a84..77191b85ea7a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -33,11 +33,9 @@ /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *cpup; - s64 hostclk, val; - int i, rc; + int rc; ar_t ar; - u64 op2; + u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -49,19 +47,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - if (store_tod_clock(&hostclk)) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - val = (val - hostclk) & ~0x3fUL; - - mutex_lock(&vcpu->kvm->lock); - preempt_disable(); - kvm_for_each_vcpu(i, cpup, vcpu->kvm) - cpup->arch.sie_block->epoch = val; - preempt_enable(); - mutex_unlock(&vcpu->kvm->lock); + kvm_s390_set_tod_clock(vcpu->kvm, val); kvm_s390_set_psw_cc(vcpu, 0); return 0; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ebf6d5e5668c..a30316bf801a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -115,6 +115,59 @@ static inline bool apic_is_x2apic_enabled(void) return msr & X2APIC_ENABLE; } +extern void enable_IR_x2apic(void); + +extern int get_physical_broadcast(void); + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void init_apic_mappings(void); +void register_lapic_address(unsigned long address); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else +extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop +#endif /* !CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -186,67 +239,14 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -#else +#else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -#endif - -extern void enable_IR_x2apic(void); - -extern int get_physical_broadcast(void); - -extern int lapic_get_maxlvt(void); -extern void clear_local_APIC(void); -extern void disconnect_bsp_APIC(int virt_wire_setup); -extern void disable_local_APIC(void); -extern void lapic_shutdown(void); -extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); -extern void setup_local_APIC(void); -extern void init_apic_mappings(void); -void register_lapic_address(unsigned long address); -extern void setup_boot_APIC_clock(void); -extern void setup_secondary_APIC_clock(void); -extern int APIC_init_uniprocessor(void); - -#ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) -{ - return -1; -} -#else -extern int apic_force_enable(unsigned long addr); -#endif - -extern int apic_bsp_setup(bool upmode); -extern void apic_ap_setup(void); - -/* - * On 32bit this is mach-xxx local - */ -#ifdef CONFIG_X86_64 -extern int apic_is_clustered_box(void); -#else -static inline int apic_is_clustered_box(void) -{ - return 0; -} -#endif - -extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); - -#else /* !CONFIG_X86_LOCAL_APIC */ -static inline void lapic_shutdown(void) { } -#define local_apic_timer_c2_ok 1 -static inline void init_apic_mappings(void) { } -static inline void disable_local_APIC(void) { } -# define setup_boot_APIC_clock x86_init_noop -# define setup_secondary_APIC_clock x86_init_noop -#endif /* !CONFIG_X86_LOCAL_APIC */ +#endif /* !CONFIG_X86_X2APIC */ #ifdef CONFIG_X86_64 #define SET_APIC_ID(x) (apic->set_apic_id(x)) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..53deb2750bf6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include <linux/perf_event.h> #include <linux/pvclock_gtod.h> #include <linux/clocksource.h> +#include <linux/irqbypass.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -176,6 +177,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -374,6 +377,7 @@ struct kvm_mtrr { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; }; struct kvm_vcpu_arch { @@ -396,6 +400,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -573,6 +578,9 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { @@ -683,6 +691,9 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { @@ -819,10 +830,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); @@ -887,6 +898,20 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { @@ -1226,11 +1251,14 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 655e07a48f6c..67f08230103a 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -41,6 +41,7 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..aa336ff3e03e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -416,6 +416,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..2677a0aac2cc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,12 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..faeb0b3bb343 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..06332cb7e7d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). + */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b372a7557c16..9da95b9daf8d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2418,7 +2418,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) u64 val, cr0, cr4; u32 base3; u16 selector; - int i; + int i, r; for (i = 0; i < 16; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); @@ -2460,13 +2460,17 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); + r = rsm_enter_protected_mode(ctxt, cr0, cr4); + if (r != X86EMUL_CONTINUE) + return r; + for (i = 0; i < 6; i++) { - int r = rsm_load_seg_64(ctxt, smbase, i); + r = rsm_load_seg_64(ctxt, smbase, i); if (r != X86EMUL_CONTINUE) return r; } - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return X86EMUL_CONTINUE; } static int em_rsm(struct x86_emulate_ctxt *ctxt) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..62cf8c915e95 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..08116ff227cc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..88d0a92d3f94 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); @@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..084617d37c74 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -73,7 +74,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +static inline int ioapic_in_kernel(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..097060e33bd6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + +/* * check if there is pending interrupt from * non-APIC source without intack. */ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ @@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* @@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return vpic != NULL; + return ret; +} + +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..8f4499c7ffc1 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -208,7 +209,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -297,6 +298,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } @@ -328,3 +356,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + bool level; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..168b8759bd73 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,7 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -764,6 +757,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. @@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { @@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; + + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) @@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); @@ -1838,7 +1916,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1922,7 +2001,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. @@ -1978,7 +2057,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -1995,7 +2074,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..fde8e35d5850 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, @@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) @@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..7d85bcae3332 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; @@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. Therefore check if the level @@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3427,7 +3445,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; @@ -3476,7 +3494,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3495,20 +3513,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; - else - force_pt_level = 0; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; @@ -3706,7 +3719,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3737,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..b41faa91a6f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; @@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else - force_pt_level = 1; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else + force_pt_level = true; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 94b7d15db3fc..f2c8e4917688 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -159,6 +159,9 @@ struct vcpu_svm { u32 apf_reason; u64 tsc_ratio; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -514,7 +517,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (svm->vmcb->control.next_rip != 0) { - WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); + WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; } @@ -866,64 +869,6 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } -#define MTRR_TYPE_UC_MINUS 7 -#define MTRR2PROTVAL_INVALID 0xff - -static u8 mtrr2protval[8]; - -static u8 fallback_mtrr_type(int mtrr) -{ - /* - * WT and WP aren't always available in the host PAT. Treat - * them as UC and UC- respectively. Everything else should be - * there. - */ - switch (mtrr) - { - case MTRR_TYPE_WRTHROUGH: - return MTRR_TYPE_UNCACHABLE; - case MTRR_TYPE_WRPROT: - return MTRR_TYPE_UC_MINUS; - default: - BUG(); - } -} - -static void build_mtrr2protval(void) -{ - int i; - u64 pat; - - for (i = 0; i < 8; i++) - mtrr2protval[i] = MTRR2PROTVAL_INVALID; - - /* Ignore the invalid MTRR types. */ - mtrr2protval[2] = 0; - mtrr2protval[3] = 0; - - /* - * Use host PAT value to figure out the mapping from guest MTRR - * values to nested page table PAT/PCD/PWT values. We do not - * want to change the host PAT value every time we enter the - * guest. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - for (i = 0; i < 8; i++) { - u8 mtrr = pat >> (8 * i); - - if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID) - mtrr2protval[mtrr] = __cm_idx2pte(i); - } - - for (i = 0; i < 8; i++) { - if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) { - u8 fallback = fallback_mtrr_type(i); - mtrr2protval[i] = mtrr2protval[fallback]; - BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID); - } - } -} - static __init int svm_hardware_setup(void) { int cpu; @@ -990,7 +935,6 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); - build_mtrr2protval(); return 0; err: @@ -1145,44 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - - /* Unlike Intel, AMD takes the guest's CR0.CD into account. - * - * AMD does not have IPAT. To emulate it for the case of guests - * with no assigned devices, just set everything to WB. If guests - * have assigned devices, however, we cannot force WB for RAM - * pages only, so use the guest PAT directly. - */ - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - *g_pat = 0x0606060606060606; - else - *g_pat = vcpu->arch.pat; -} - -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - u8 mtrr; - - /* - * 1. MMIO: trust guest MTRR, so same as item 3. - * 2. No passthrough: always map as WB, and force guest PAT to WB as well - * 3. Passthrough: can't guarantee the result, try to trust guest. - */ - if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED) && - kvm_read_cr0(vcpu) & X86_CR0_CD) - return _PAGE_NOCACHE; - - mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); - return mtrr2protval[mtrr]; -} - -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1253,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1278,7 +1184,6 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; - svm_set_guest_pat(svm, &save->g_pat); save->cr3 = 0; save->cr4 = 0; } @@ -1309,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1365,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1673,10 +1578,13 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - - /* These are emulated via page tables. */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -1984,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2459,7 +2367,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -3154,7 +3064,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3351,16 +3261,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_IA32_CR_PAT: - if (npt_enabled) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vcpu->arch.pat = data; - svm_set_guest_pat(svm, &svm->vmcb->save.g_pat); - mark_dirty(svm->vmcb, VMCB_NPT); - break; - } - /* fall through */ default: return kvm_set_msr_common(vcpu, msr); } @@ -3398,24 +3298,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } @@ -3763,12 +3650,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } @@ -4195,8 +4082,17 @@ static bool svm_has_high_real_mode_segbase(void) return true; } +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + return 0; +} + static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -4524,7 +4420,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..120302511802 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio, ); /* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + +/* * Tracepoint for cpuid. */ TRACE_EVENT(kvm_cpuid, @@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? "enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64076740251e..4d0aa31a42ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/vmx.h> @@ -45,6 +46,7 @@ #include <asm/debugreg.h> #include <asm/kexec.h> #include <asm/apic.h> +#include <asm/irq_remapping.h> #include "trace.h" #include "pmu.h" @@ -424,6 +426,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -440,14 +445,33 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) @@ -467,6 +491,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -532,8 +580,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -563,6 +609,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -809,7 +860,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -831,6 +882,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -946,9 +1004,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -983,7 +1041,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) @@ -1062,9 +1121,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -1157,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -1337,13 +1401,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1352,10 +1416,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -1895,6 +1959,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -1945,10 +2055,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -2207,7 +2334,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2377,7 +2504,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -2471,10 +2598,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2493,6 +2622,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -2608,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; @@ -2673,7 +2809,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Otherwise falls through */ default: @@ -2779,7 +2915,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -2874,6 +3010,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -3015,7 +3153,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3441,9 +3580,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3451,6 +3590,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4105,17 +4249,13 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (r) goto out; @@ -4140,44 +4280,38 @@ static int alloc_identity_pagetable(struct kvm *kvm) { /* Called with kvm->slots_lock held. */ - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; BUG_ON(kvm->arch.ept_identity_pagetable_done); - kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = - kvm->arch.ept_identity_map_addr; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -4332,9 +4466,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return enable_apicv && irqchip_in_kernel(kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4378,6 +4512,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; @@ -4514,7 +4664,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4526,7 +4676,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4543,7 +4693,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -4557,7 +4707,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4570,6 +4720,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) /* PML is enabled/disabled in creating/destorying vcpu */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; + return exec_control; } @@ -4613,12 +4766,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4762,7 +4914,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4770,7 +4922,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -4780,12 +4932,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* @@ -4949,14 +5100,9 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = TSS_PRIVATE_MEMSLOT, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - ret = x86_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -5310,7 +5456,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5524,17 +5670,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } @@ -5767,6 +5902,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } @@ -5924,6 +6060,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6110,6 +6265,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -6641,7 +6798,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6654,9 +6810,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -6676,6 +6831,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7052,7 +7208,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7084,9 +7239,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7192,7 +7346,63 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + if (get_vmcs12(vcpu)->virtual_processor_id == 0) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } @@ -7221,6 +7431,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7271,6 +7488,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7572,6 +7790,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -7586,7 +7806,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) static int vmx_enable_pml(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7597,24 +7816,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); return 0; } static void vmx_disable_pml(struct vcpu_vmx *vmx) { - u32 exec_control; - ASSERT(vmx->pml_pg); __free_page(vmx->pml_pg); vmx->pml_pg = NULL; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -7938,10 +8151,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8043,9 +8256,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { - if (!vmx_vm_has_apicv(vcpu->kvm)) + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8492,7 +8706,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_disable_pml(vmx); - free_vpid(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8511,7 +8725,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8544,7 +8758,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -8559,8 +8773,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8581,13 +8797,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -8617,17 +8834,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u64 ipat = 0; /* For VT-d and EPT combination - * 1. MMIO: guest may want to apply WC, trust it. + * 1. MMIO: always map as UC * 2. EPT with VT-d: * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. So the same as item 1. + * result, try to trust guest. * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; @@ -8657,49 +8879,67 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; + + if (nested) { + if (rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; } /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + vmcs_set_secondary_exec_control(secondary_exec_ctl); + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9307,13 +9547,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -9332,7 +9572,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); @@ -9442,12 +9682,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { @@ -10287,6 +10539,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10356,7 +10803,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, @@ -10403,7 +10850,12 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 991466bf8dee..9e9c226cb79d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include <linux/pci.h> #include <linux/timekeeper_internal.h> #include <linux/pvclock_gtod.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -64,6 +66,7 @@ #include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> +#include <asm/irq_remapping.h> #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -788,7 +791,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +801,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -952,6 +955,9 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1708,8 +1714,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } - pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; - /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -1899,6 +1903,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2007,8 +2013,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; - - ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; @@ -2051,12 +2055,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2452,6 +2450,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2631,7 +2630,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } @@ -2665,12 +2663,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; return 0; } @@ -3179,7 +3189,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -3559,6 +3569,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. */ + smp_wmb(); + kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3672,7 +3704,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3696,7 +3728,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -5670,7 +5702,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -5777,9 +5809,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) + return false; + + if (kvm_cpu_has_interrupt(vcpu)) + return false; + + return (irqchip_split(vcpu->kvm) + ? kvm_apic_accept_pic_intr(vcpu) + : kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5790,13 +5828,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else + if (!irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = kvm_arch_interrupt_allowed(vcpu) && !kvm_cpu_has_interrupt(vcpu) && !kvm_event_needs_reinjection(vcpu); + else if (!pic_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = + kvm_apic_accept_pic_intr(vcpu) && + !kvm_cpu_has_interrupt(vcpu); + else + kvm_run->ready_for_interrupt_injection = 1; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -6147,18 +6189,18 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - u32 tmr[8]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) @@ -6171,7 +6213,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6209,7 +6251,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + bool req_int_win = !lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window; bool req_immediate_exit = false; @@ -6261,6 +6303,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) @@ -6271,6 +6324,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } + } + + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6289,13 +6362,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6431,10 +6497,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6457,6 +6528,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) return 1; } +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + static int vcpu_run(struct kvm_vcpu *vcpu) { int r; @@ -6465,11 +6542,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); for (;;) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; @@ -6478,8 +6556,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } @@ -6606,7 +6684,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -7306,7 +7384,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7375,6 +7453,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; fail_free_mce_banks: @@ -7400,7 +7480,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } @@ -7478,34 +7558,66 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; + unsigned long hva; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot, old; /* Called with kvm->slots_lock held. */ - BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + return -EINVAL; + + slot = id_to_memslot(slots, id); + if (size) { + if (WARN_ON(slot->npages)) + return -EEXIST; + + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. + */ + hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); + if (IS_ERR((void *)hva)) + return PTR_ERR((void *)hva); + } else { + if (!slot->npages) + return 0; + hva = 0; + } + + old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m = *mem; + struct kvm_userspace_memory_region m; - m.slot |= i << 16; + m.slot = id | (i << 16); + m.flags = 0; + m.guest_phys_addr = gpa; + m.userspace_addr = hva; + m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) return r; } + if (!size) { + r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + WARN_ON(r < 0); + } + return 0; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int r; mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, mem); + r = __x86_set_memory_region(kvm, id, gpa, size); mutex_unlock(&kvm->slots_lock); return r; @@ -7520,16 +7632,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - struct kvm_userspace_memory_region mem; - memset(&mem, 0, sizeof(mem)); - mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = TSS_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); @@ -7632,27 +7737,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* - * Only private memory slots need to be mapped here since - * KVM_SET_MEMORY_REGION ioctl is no longer supported. - */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { - unsigned long userspace_addr; - - /* - * MAP_SHARED to prevent internal slot pages from being moved - * by fork()/COW. - */ - userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - return 0; } @@ -7714,17 +7798,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int nr_mmu_pages = 0; - if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { - int ret; - - ret = vm_munmap(old->userspace_addr, - old->npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -7773,19 +7846,36 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_mmu_invalidate_zap_all_pages(kvm); } +static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_events(vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) + return true; + + if (atomic_read(&vcpu->arch.nmi_queued)) + return true; + + if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)) + return true; + + return false; +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) kvm_x86_ops->check_nested_events(vcpu, false); - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - || !list_empty_careful(&vcpu->async_pf.done) - || kvm_apic_has_events(vcpu) - || vcpu->arch.pv.pv_unhalted - || atomic_read(&vcpu->arch.nmi_queued) || - (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)); + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -8017,7 +8107,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. + */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); @@ -8032,3 +8174,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 913455a5fd40..8adaaeae3268 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -22,7 +22,7 @@ int irq_remap_broken; int disable_sourceid_checking; int no_x2apic_optout; -int disable_irq_post = 1; +int disable_irq_post = 0; static int disable_irq_remap; static struct irq_remap_ops *remap_ops; @@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str) return -EINVAL; while (*str) { - if (!strncmp(str, "on", 2)) + if (!strncmp(str, "on", 2)) { disable_irq_remap = 0; - else if (!strncmp(str, "off", 3)) + disable_irq_post = 0; + } else if (!strncmp(str, "off", 3)) { disable_irq_remap = 1; - else if (!strncmp(str, "nosid", 5)) + disable_irq_post = 1; + } else if (!strncmp(str, "nosid", 5)) disable_sourceid_checking = 1; else if (!strncmp(str, "no_x2apic_optout", 16)) no_x2apic_optout = 1; + else if (!strncmp(str, "nopost", 6)) + disable_irq_post = 1; str += strcspn(str, ","); while (*str == ',') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 454017928ed0..850d86ca685b 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -33,3 +33,4 @@ menuconfig VFIO source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 579d83bf5358..02912f180c6d 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -2,6 +2,7 @@ config VFIO_PCI tristate "VFIO support for PCI devices" depends on VFIO && PCI && EVENTFD select VFIO_VIRQFD + select IRQ_BYPASS_MANAGER help Support for the PCI VFIO bus driver. This is required to make use of PCI drivers using the VFIO framework. diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 1f577b4ac126..3b3ba15558b7 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, if (vdev->ctx[vector].trigger) { free_irq(irq, vdev->ctx[vector].trigger); + irq_bypass_unregister_producer(&vdev->ctx[vector].producer); kfree(vdev->ctx[vector].name); eventfd_ctx_put(vdev->ctx[vector].trigger); vdev->ctx[vector].trigger = NULL; @@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return ret; } + vdev->ctx[vector].producer.token = trigger; + vdev->ctx[vector].producer.irq = irq; + ret = irq_bypass_register_producer(&vdev->ctx[vector].producer); + if (unlikely(ret)) + dev_info(&pdev->dev, + "irq bypass producer (token %p) registration fails: %d\n", + vdev->ctx[vector].producer.token, ret); + vdev->ctx[vector].trigger = trigger; return 0; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index ae0e1b4c1711..0e7394f8f69b 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -13,6 +13,7 @@ #include <linux/mutex.h> #include <linux/pci.h> +#include <linux/irqbypass.h> #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx { struct virqfd *mask; char *name; bool masked; + struct irq_bypass_producer producer; }; struct vfio_pci_device { diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h new file mode 100644 index 000000000000..1551b5b2f4c2 --- /dev/null +++ b/include/linux/irqbypass.h @@ -0,0 +1,90 @@ +/* + * IRQ offload/bypass manager + * + * Copyright (C) 2015 Red Hat, Inc. + * Copyright (c) 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef IRQBYPASS_H +#define IRQBYPASS_H + +#include <linux/list.h> + +struct irq_bypass_consumer; + +/* + * Theory of operation + * + * The IRQ bypass manager is a simple set of lists and callbacks that allows + * IRQ producers (ex. physical interrupt sources) to be matched to IRQ + * consumers (ex. virtualization hardware that allows IRQ bypass or offload) + * via a shared token (ex. eventfd_ctx). Producers and consumers register + * independently. When a token match is found, the optional @stop callback + * will be called for each participant. The pair will then be connected via + * the @add_* callbacks, and finally the optional @start callback will allow + * any final coordination. When either participant is unregistered, the + * process is repeated using the @del_* callbacks in place of the @add_* + * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings + * are not supported. + */ + +/** + * struct irq_bypass_producer - IRQ bypass producer definition + * @node: IRQ bypass manager private list management + * @token: opaque token to match between producer and consumer + * @irq: Linux IRQ number for the producer device + * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) + * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) + * @stop: Perform any quiesce operations necessary prior to add/del (optional) + * @start: Perform any startup operations necessary after add/del (optional) + * + * The IRQ bypass producer structure represents an interrupt source for + * participation in possible host bypass, for instance an interrupt vector + * for a physical device assigned to a VM. + */ +struct irq_bypass_producer { + struct list_head node; + void *token; + int irq; + int (*add_consumer)(struct irq_bypass_producer *, + struct irq_bypass_consumer *); + void (*del_consumer)(struct irq_bypass_producer *, + struct irq_bypass_consumer *); + void (*stop)(struct irq_bypass_producer *); + void (*start)(struct irq_bypass_producer *); +}; + +/** + * struct irq_bypass_consumer - IRQ bypass consumer definition + * @node: IRQ bypass manager private list management + * @token: opaque token to match between producer and consumer + * @add_producer: Connect the IRQ consumer to an IRQ producer + * @del_producer: Disconnect the IRQ consumer from an IRQ producer + * @stop: Perform any quiesce operations necessary prior to add/del (optional) + * @start: Perform any startup operations necessary after add/del (optional) + * + * The IRQ bypass consumer structure represents an interrupt sink for + * participation in possible host bypass, for instance a hypervisor may + * support offloads to allow bypassing the host entirely or offload + * portions of the interrupt handling to the VM. + */ +struct irq_bypass_consumer { + struct list_head node; + void *token; + int (*add_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*del_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*stop)(struct irq_bypass_consumer *); + void (*start)(struct irq_bypass_consumer *); +}; + +int irq_bypass_register_producer(struct irq_bypass_producer *); +void irq_bypass_unregister_producer(struct irq_bypass_producer *); +int irq_bypass_register_consumer(struct irq_bypass_consumer *); +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); + +#endif /* IRQBYPASS_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..eba9caebc9c1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -24,6 +24,7 @@ #include <linux/err.h> #include <linux/irqflags.h> #include <linux/context_tracking.h> +#include <linux/irqbypass.h> #include <asm/signal.h> #include <linux/kvm.h> @@ -140,6 +141,8 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 +#define KVM_REQ_IOAPIC_EOI_EXIT 28 +#define KVM_REQ_HV_RESET 29 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -231,6 +234,9 @@ struct kvm_vcpu { unsigned long requests; unsigned long guest_debug; + int pre_pcpu; + struct list_head blocked_vcpu_list; + struct mutex mutex; struct kvm_run *run; @@ -329,6 +335,18 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -455,10 +473,14 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_irq_routing_update(struct kvm *kvm); #else static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { } +static inline void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +} #endif #ifdef CONFIG_HAVE_KVM_IRQFD @@ -806,7 +828,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); + +int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, + int irq_source_id, int level, bool line_status); + bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); +void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); @@ -1002,6 +1029,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #endif int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, @@ -1144,5 +1172,15 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ -#endif +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, + struct irq_bypass_producer *); +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, + struct irq_bypass_producer *); +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); +#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ +#endif diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h new file mode 100644 index 000000000000..0c1de05098c8 --- /dev/null +++ b/include/linux/kvm_irqfd.h @@ -0,0 +1,71 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * Credit goes to Avi Kivity for the original idea. + */ + +#ifndef __LINUX_KVM_IRQFD_H +#define __LINUX_KVM_IRQFD_H + +#include <linux/kvm_host.h> +#include <linux/poll.h> + +/* + * Resampling irqfds are a special variety of irqfds used to emulate + * level triggered interrupts. The interrupt is asserted on eventfd + * trigger. On acknowledgment through the irq ack notifier, the + * interrupt is de-asserted and userspace is notified through the + * resamplefd. All resamplers on the same gsi are de-asserted + * together, so we don't need to track the state of each individual + * user. We can also therefore share the same irq source ID. + */ +struct kvm_kernel_irqfd_resampler { + struct kvm *kvm; + /* + * List of resampling struct _irqfd objects sharing this gsi. + * RCU list modified under kvm->irqfds.resampler_lock + */ + struct list_head list; + struct kvm_irq_ack_notifier notifier; + /* + * Entry in list of kvm->irqfd.resampler_list. Use for sharing + * resamplers among irqfds on the same gsi. + * Accessed and modified under kvm->irqfds.resampler_lock + */ + struct list_head link; +}; + +struct kvm_kernel_irqfd { + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry irq_entry; + seqcount_t irq_entry_sc; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* The resampler used by this irqfd (resampler-only) */ + struct kvm_kernel_irqfd_resampler *resampler; + /* Eventfd notified on resample (resampler-only) */ + struct eventfd_ctx *resamplefd; + /* Entry in list of irqfds for a resampler (resampler-only) */ + struct list_head resampler_link; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; + struct irq_bypass_consumer consumer; + struct irq_bypass_producer *producer; +}; + +#endif /* __LINUX_KVM_IRQFD_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9256f0331ae..03f3618612aa 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -183,6 +183,7 @@ struct kvm_s390_skeys { #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -333,6 +334,10 @@ struct kvm_run { __u8 sel1; __u16 sel2; } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; /* Fix the size of the union. */ char padding[256]; }; @@ -824,6 +829,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 +#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c index 88fe83dff7cd..18536f756577 100644 --- a/tools/lib/traceevent/plugin_kvm.c +++ b/tools/lib/traceevent/plugin_kvm.c @@ -124,7 +124,10 @@ static const char *disassemble(unsigned char *insn, int len, uint64_t rip, _ER(WBINVD, 54) \ _ER(XSETBV, 55) \ _ER(APIC_WRITE, 56) \ - _ER(INVPCID, 58) + _ER(INVPCID, 58) \ + _ER(PML_FULL, 62) \ + _ER(XSAVES, 63) \ + _ER(XRSTORS, 64) #define SVM_EXIT_REASONS \ _ER(EXIT_READ_CR0, 0x000) \ @@ -352,15 +355,18 @@ static int kvm_nested_vmexit_handler(struct trace_seq *s, struct pevent_record * union kvm_mmu_page_role { unsigned word; struct { - unsigned glevels:4; unsigned level:4; + unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned cr4_pge:1; unsigned nxe:1; + unsigned cr0_wp:1; + unsigned smep_and_not_wp:1; + unsigned smap_and_not_wp:1; + unsigned pad_for_nice_hex_output:8; + unsigned smm:8; }; }; @@ -385,15 +391,18 @@ static int kvm_mmu_print_role(struct trace_seq *s, struct pevent_record *record, if (pevent_is_file_bigendian(event->pevent) == pevent_is_host_bigendian(event->pevent)) { - trace_seq_printf(s, "%u/%u q%u%s %s%s %spge %snxe", + trace_seq_printf(s, "%u q%u%s %s%s %spae %snxe %swp%s%s%s", role.level, - role.glevels, role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", - role.cr4_pge ? "" : "!", - role.nxe ? "" : "!"); + role.cr4_pae ? "" : "!", + role.nxe ? "" : "!", + role.cr0_wp ? "" : "!", + role.smep_and_not_wp ? " smep" : "", + role.smap_and_not_wp ? " smap" : "", + role.smm ? " smm" : ""); } else trace_seq_printf(s, "WORD: %08x", role.word); diff --git a/virt/Makefile b/virt/Makefile new file mode 100644 index 000000000000..be783472ac81 --- /dev/null +++ b/virt/Makefile @@ -0,0 +1 @@ +obj-y += lib/ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e2c876d5a03b..9f8014dda2cf 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -47,3 +47,6 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT config KVM_COMPAT def_bool y depends on COMPAT && !S390 + +config HAVE_KVM_IRQ_BYPASS + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 44660aee335f..77d42be6970e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); + /* + * This memory barrier pairs with prepare_to_wait's set_current_state() + */ + smp_mb(); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 79db45336e3a..e29fd2640709 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -23,6 +23,7 @@ #include <linux/kvm_host.h> #include <linux/kvm.h> +#include <linux/kvm_irqfd.h> #include <linux/workqueue.h> #include <linux/syscalls.h> #include <linux/wait.h> @@ -34,73 +35,20 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/seqlock.h> +#include <linux/irqbypass.h> #include <trace/events/kvm.h> #include <kvm/iodev.h> #ifdef CONFIG_HAVE_KVM_IRQFD -/* - * -------------------------------------------------------------------- - * irqfd: Allows an fd to be used to inject an interrupt to the guest - * - * Credit goes to Avi Kivity for the original idea. - * -------------------------------------------------------------------- - */ - -/* - * Resampling irqfds are a special variety of irqfds used to emulate - * level triggered interrupts. The interrupt is asserted on eventfd - * trigger. On acknowledgement through the irq ack notifier, the - * interrupt is de-asserted and userspace is notified through the - * resamplefd. All resamplers on the same gsi are de-asserted - * together, so we don't need to track the state of each individual - * user. We can also therefore share the same irq source ID. - */ -struct _irqfd_resampler { - struct kvm *kvm; - /* - * List of resampling struct _irqfd objects sharing this gsi. - * RCU list modified under kvm->irqfds.resampler_lock - */ - struct list_head list; - struct kvm_irq_ack_notifier notifier; - /* - * Entry in list of kvm->irqfd.resampler_list. Use for sharing - * resamplers among irqfds on the same gsi. - * Accessed and modified under kvm->irqfds.resampler_lock - */ - struct list_head link; -}; - -struct _irqfd { - /* Used for MSI fast-path */ - struct kvm *kvm; - wait_queue_t wait; - /* Update side is protected by irqfds.lock */ - struct kvm_kernel_irq_routing_entry irq_entry; - seqcount_t irq_entry_sc; - /* Used for level IRQ fast-path */ - int gsi; - struct work_struct inject; - /* The resampler used by this irqfd (resampler-only) */ - struct _irqfd_resampler *resampler; - /* Eventfd notified on resample (resampler-only) */ - struct eventfd_ctx *resamplefd; - /* Entry in list of irqfds for a resampler (resampler-only) */ - struct list_head resampler_link; - /* Used for setup/shutdown */ - struct eventfd_ctx *eventfd; - struct list_head list; - poll_table pt; - struct work_struct shutdown; -}; static struct workqueue_struct *irqfd_cleanup_wq; static void irqfd_inject(struct work_struct *work) { - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm_kernel_irqfd *irqfd = + container_of(work, struct kvm_kernel_irqfd, inject); struct kvm *kvm = irqfd->kvm; if (!irqfd->resampler) { @@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work) static void irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) { - struct _irqfd_resampler *resampler; + struct kvm_kernel_irqfd_resampler *resampler; struct kvm *kvm; - struct _irqfd *irqfd; + struct kvm_kernel_irqfd *irqfd; int idx; - resampler = container_of(kian, struct _irqfd_resampler, notifier); + resampler = container_of(kian, + struct kvm_kernel_irqfd_resampler, notifier); kvm = resampler->kvm; kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, @@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) } static void -irqfd_resampler_shutdown(struct _irqfd *irqfd) +irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) { - struct _irqfd_resampler *resampler = irqfd->resampler; + struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler; struct kvm *kvm = resampler->kvm; mutex_lock(&kvm->irqfds.resampler_lock); @@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd) static void irqfd_shutdown(struct work_struct *work) { - struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + struct kvm_kernel_irqfd *irqfd = + container_of(work, struct kvm_kernel_irqfd, shutdown); u64 cnt; /* @@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work) /* * It is now safe to release the object's resources */ +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + irq_bypass_unregister_consumer(&irqfd->consumer); +#endif eventfd_ctx_put(irqfd->eventfd); kfree(irqfd); } @@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work) /* assumes kvm->irqfds.lock is held */ static bool -irqfd_is_active(struct _irqfd *irqfd) +irqfd_is_active(struct kvm_kernel_irqfd *irqfd) { return list_empty(&irqfd->list) ? false : true; } @@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd) * assumes kvm->irqfds.lock is held */ static void -irqfd_deactivate(struct _irqfd *irqfd) +irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) { BUG_ON(!irqfd_is_active(irqfd)); @@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } +int __attribute__((weak)) kvm_arch_set_irq( + struct kvm_kernel_irq_routing_entry *irq, + struct kvm *kvm, int irq_source_id, + int level, + bool line_status) +{ + return -EWOULDBLOCK; +} + /* * Called with wqh->lock held and interrupts disabled */ static int irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { - struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + struct kvm_kernel_irqfd *irqfd = + container_of(wait, struct kvm_kernel_irqfd, wait); unsigned long flags = (unsigned long)key; struct kvm_kernel_irq_routing_entry irq; struct kvm *kvm = irqfd->kvm; @@ -241,7 +204,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) if (irq.type == KVM_IRQ_ROUTING_MSI) kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); - else + else if (kvm_arch_set_irq(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } @@ -274,37 +239,54 @@ static void irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { - struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + struct kvm_kernel_irqfd *irqfd = + container_of(pt, struct kvm_kernel_irqfd, pt); add_wait_queue(wqh, &irqfd->wait); } /* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) +static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - int i, n_entries; + int n_entries; n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); - irqfd->irq_entry.type = 0; - e = entries; - for (i = 0; i < n_entries; ++i, ++e) { - /* Only fast-path MSI. */ - if (e->type == KVM_IRQ_ROUTING_MSI) - irqfd->irq_entry = *e; - } + if (n_entries == 1) + irqfd->irq_entry = *e; + else + irqfd->irq_entry.type = 0; write_seqcount_end(&irqfd->irq_entry_sc); } +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +void __attribute__((weak)) kvm_arch_irq_bypass_stop( + struct irq_bypass_consumer *cons) +{ +} + +void __attribute__((weak)) kvm_arch_irq_bypass_start( + struct irq_bypass_consumer *cons) +{ +} + +int __attribute__((weak)) kvm_arch_update_irqfd_routing( + struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + return 0; +} +#endif + static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; struct fd f; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; @@ -340,7 +322,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) irqfd->eventfd = eventfd; if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { - struct _irqfd_resampler *resampler; + struct kvm_kernel_irqfd_resampler *resampler; resamplefd = eventfd_ctx_fdget(args->resamplefd); if (IS_ERR(resamplefd)) { @@ -428,6 +410,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * we might race against the POLLHUP */ fdput(f); +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + irqfd->consumer.token = (void *)irqfd->eventfd; + irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; + irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; + irqfd->consumer.stop = kvm_arch_irq_bypass_stop; + irqfd->consumer.start = kvm_arch_irq_bypass_start; + ret = irq_bypass_register_consumer(&irqfd->consumer); + if (ret) + pr_info("irq bypass consumer (token %p) registration fails: %d\n", + irqfd->consumer.token, ret); +#endif return 0; @@ -469,9 +462,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) } EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; + + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ int gsi, idx; trace_kvm_ack_irq(irqchip, pin); @@ -479,10 +481,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) - kian->irq_acked(kian); + kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } @@ -525,7 +524,7 @@ kvm_eventfd_init(struct kvm *kvm) static int kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd; eventfd = eventfd_ctx_fdget(args->fd); @@ -581,7 +580,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) void kvm_irqfd_release(struct kvm *kvm) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; spin_lock_irq(&kvm->irqfds.lock); @@ -604,13 +603,23 @@ kvm_irqfd_release(struct kvm *kvm) */ void kvm_irq_routing_update(struct kvm *kvm) { - struct _irqfd *irqfd; + struct kvm_kernel_irqfd *irqfd; spin_lock_irq(&kvm->irqfds.lock); - list_for_each_entry(irqfd, &kvm->irqfds.items, list) + list_for_each_entry(irqfd, &kvm->irqfds.items, list) { irqfd_update(kvm, irqfd); +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + if (irqfd->producer) { + int ret = kvm_arch_update_irqfd_routing( + irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, 1); + WARN_ON(ret); + } +#endif + } + spin_unlock_irq(&kvm->irqfds.lock); } @@ -914,9 +923,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return -EINVAL; /* ioeventfd with no length can't be combined with DATAMATCH */ - if (!args->len && - args->flags & (KVM_IOEVENTFD_FLAG_PIO | - KVM_IOEVENTFD_FLAG_DATAMATCH)) + if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)) return -EINVAL; ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index d7ea8e20dae4..f0b08a2a48ba 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,16 +31,6 @@ #include <trace/events/kvm.h> #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { @@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, /* * Do not allow GSI to be mapped to the same irqchip more than once. - * Allow only one to one mapping between GSI and MSI. + * Allow only one to one mapping between GSI and non-irqchip routing. */ hlist_for_each_entry(ei, &rt->map[ue->gsi], link) - if (ei->type == KVM_IRQ_ROUTING_MSI || - ue->type == KVM_IRQ_ROUTING_MSI || + if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || + ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return r; @@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); + kvm_arch_irq_routing_update(kvm); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..a75502c93c3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; @@ -2718,6 +2721,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IRQFD: case KVM_CAP_IRQFD_RESAMPLE: #endif + case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING @@ -3341,7 +3345,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; @@ -3373,7 +3377,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (r) return r; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; diff --git a/virt/lib/Kconfig b/virt/lib/Kconfig new file mode 100644 index 000000000000..89a414f815d2 --- /dev/null +++ b/virt/lib/Kconfig @@ -0,0 +1,2 @@ +config IRQ_BYPASS_MANAGER + tristate diff --git a/virt/lib/Makefile b/virt/lib/Makefile new file mode 100644 index 000000000000..901228d1ffbc --- /dev/null +++ b/virt/lib/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c new file mode 100644 index 000000000000..09a03b5a21ff --- /dev/null +++ b/virt/lib/irqbypass.c @@ -0,0 +1,257 @@ +/* + * IRQ offload/bypass manager + * + * Copyright (C) 2015 Red Hat, Inc. + * Copyright (c) 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Various virtualization hardware acceleration techniques allow bypassing or + * offloading interrupts received from devices around the host kernel. Posted + * Interrupts on Intel VT-d systems can allow interrupts to be received + * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical + * interrupts to be directly deactivated by the guest. This manager allows + * interrupt producers and consumers to find each other to enable this sort of + * bypass. + */ + +#include <linux/irqbypass.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("IRQ bypass manager utility module"); + +static LIST_HEAD(producers); +static LIST_HEAD(consumers); +static DEFINE_MUTEX(lock); + +/* @lock must be held when calling connect */ +static int __connect(struct irq_bypass_producer *prod, + struct irq_bypass_consumer *cons) +{ + int ret = 0; + + if (prod->stop) + prod->stop(prod); + if (cons->stop) + cons->stop(cons); + + if (prod->add_consumer) + ret = prod->add_consumer(prod, cons); + + if (!ret) { + ret = cons->add_producer(cons, prod); + if (ret && prod->del_consumer) + prod->del_consumer(prod, cons); + } + + if (cons->start) + cons->start(cons); + if (prod->start) + prod->start(prod); + + return ret; +} + +/* @lock must be held when calling disconnect */ +static void __disconnect(struct irq_bypass_producer *prod, + struct irq_bypass_consumer *cons) +{ + if (prod->stop) + prod->stop(prod); + if (cons->stop) + cons->stop(cons); + + cons->del_producer(cons, prod); + + if (prod->del_consumer) + prod->del_consumer(prod, cons); + + if (cons->start) + cons->start(cons); + if (prod->start) + prod->start(prod); +} + +/** + * irq_bypass_register_producer - register IRQ bypass producer + * @producer: pointer to producer structure + * + * Add the provided IRQ producer to the list of producers and connect + * with any matching token found on the IRQ consumers list. + */ +int irq_bypass_register_producer(struct irq_bypass_producer *producer) +{ + struct irq_bypass_producer *tmp; + struct irq_bypass_consumer *consumer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&lock); + + list_for_each_entry(tmp, &producers, node) { + if (tmp->token == producer->token) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; + } + } + + list_for_each_entry(consumer, &consumers, node) { + if (consumer->token == producer->token) { + int ret = __connect(producer, consumer); + if (ret) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; + } + break; + } + } + + list_add(&producer->node, &producers); + + mutex_unlock(&lock); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_bypass_register_producer); + +/** + * irq_bypass_unregister_producer - unregister IRQ bypass producer + * @producer: pointer to producer structure + * + * Remove a previously registered IRQ producer from the list of producers + * and disconnect it from any connected IRQ consumer. + */ +void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) +{ + struct irq_bypass_producer *tmp; + struct irq_bypass_consumer *consumer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return; /* nothing in the list anyway */ + + mutex_lock(&lock); + + list_for_each_entry(tmp, &producers, node) { + if (tmp->token != producer->token) + continue; + + list_for_each_entry(consumer, &consumers, node) { + if (consumer->token == producer->token) { + __disconnect(producer, consumer); + break; + } + } + + list_del(&producer->node); + module_put(THIS_MODULE); + break; + } + + mutex_unlock(&lock); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); + +/** + * irq_bypass_register_consumer - register IRQ bypass consumer + * @consumer: pointer to consumer structure + * + * Add the provided IRQ consumer to the list of consumers and connect + * with any matching token found on the IRQ producer list. + */ +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +{ + struct irq_bypass_consumer *tmp; + struct irq_bypass_producer *producer; + + if (!consumer->add_producer || !consumer->del_producer) + return -EINVAL; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { + if (tmp->token == consumer->token) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; + } + } + + list_for_each_entry(producer, &producers, node) { + if (producer->token == consumer->token) { + int ret = __connect(producer, consumer); + if (ret) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; + } + break; + } + } + + list_add(&consumer->node, &consumers); + + mutex_unlock(&lock); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); + +/** + * irq_bypass_unregister_consumer - unregister IRQ bypass consumer + * @consumer: pointer to consumer structure + * + * Remove a previously registered IRQ consumer from the list of consumers + * and disconnect it from any connected IRQ producer. + */ +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) +{ + struct irq_bypass_consumer *tmp; + struct irq_bypass_producer *producer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return; /* nothing in the list anyway */ + + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { + if (tmp->token != consumer->token) + continue; + + list_for_each_entry(producer, &producers, node) { + if (producer->token == consumer->token) { + __disconnect(producer, consumer); + break; + } + } + + list_del(&consumer->node); + module_put(THIS_MODULE); + break; + } + + mutex_unlock(&lock); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); |