111 files changed, 6437 insertions, 2379 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index df98b6304769..cd209f7730af 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the flag KVM_VM_MIPS_VZ. +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST @@ -850,7 +881,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + __u8 pending; __u32 error_code; } exception; struct { @@ -873,15 +904,23 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; -Only two fields are defined in the flags field: +The following bits are defined in the flags field: -- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that +- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that interrupt.shadow contains a valid state. -- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that - smi contains a valid state. +- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a + valid state. + +- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the + exception_has_payload, exception_payload, and exception.pending + fields contain a valid state. This bit will be set whenever + KVM_CAP_EXCEPTION_PAYLOAD is enabled. ARM/ARM64: @@ -961,6 +1000,11 @@ shall be written into the VCPU. KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. +If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD +can be set in the flags field to signal that the +exception_has_payload, exception_payload, and exception.pending fields +contain a valid state and shall be written into the VCPU. + ARM/ARM64: Set the pending SError exception state for this VCPU. It is not possible to @@ -3681,6 +3725,34 @@ Returns: 0 on success, -1 on error This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. 
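The IPA-size handshake documented in the api.txt hunk above amounts to one KVM_CHECK_EXTENSION query followed by KVM_CREATE_VM with the size encoded in the machine type. A minimal userspace sketch (not part of the patch; the helper name and fallback policy are illustrative, error handling trimmed):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Create a VM using the widest IPA space the host advertises. */
  static int create_vm_max_ipa(void)
  {
          int dev_fd, vm_fd, ipa_limit;

          dev_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
          if (dev_fd < 0)
                  return -1;

          /* Returns Host_IPA_Limit, or 0 if the extension is absent. */
          ipa_limit = ioctl(dev_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
          if (ipa_limit > 0)
                  vm_fd = ioctl(dev_fd, KVM_CREATE_VM,
                                KVM_VM_TYPE_ARM_IPA_SIZE(ipa_limit));
          else
                  vm_fd = ioctl(dev_fd, KVM_CREATE_VM, 0); /* default 40bit IPA */

          printf("IPA limit: %d bits, vm_fd: %d\n", ipa_limit, vm_fd);
          return vm_fd;
  }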
+4.116 KVM_(UN)REGISTER_COALESCED_MMIO + +Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) + KVM_CAP_COALESCED_PIO (for coalesced pio) +Architectures: all +Type: vm ioctl +Parameters: struct kvm_coalesced_mmio_zone +Returns: 0 on success, < 0 on error + +Coalesced I/O is a performance optimization that defers hardware +register write emulation so that userspace exits are avoided. It is +typically used to reduce the overhead of emulating frequently accessed +hardware registers. + +When a hardware register is configured for coalesced I/O, write accesses +do not exit to userspace and their value is recorded in a ring buffer +that is shared between kernel and userspace. + +Coalesced I/O is used if one or more write accesses to a hardware +register can be deferred until a read or a write to another hardware +register on the same device. This last access will cause a vmexit and +userspace will process accesses from the ring buffer before emulating +it. That will avoid exiting to userspace on repeated writes. + +Coalesced pio is based on coalesced mmio. There is little difference +between coalesced mmio and pio except that coalesced pio records accesses +to I/O ports. + 5. The kvm_run structure ------------------------ @@ -4527,7 +4599,7 @@ hpage module parameter is not set to 1, -EINVAL is returned. While it is generally possible to create a huge page backed VM without this capability, the VM will not be able to run. -7.14 KVM_CAP_MSR_PLATFORM_INFO +7.15 KVM_CAP_MSR_PLATFORM_INFO Architectures: x86 Parameters: args[0] whether feature should be enabled or not @@ -4550,6 +4622,31 @@ state). Enabling this capability on a VM depends on the CPU having the necessary functionality and on the facility being enabled with a kvm-hv module parameter. +7.17 KVM_CAP_EXCEPTION_PAYLOAD + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, CR2 will not be modified prior to the +emulated VM-exit when L1 intercepts a #PF exception that occurs in +L2. Similarly, for kvm-intel only, DR6 will not be modified prior to +the emulated VM-exit when L1 intercepts a #DB exception that occurs in +L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or +#DB) exception for L2, exception.has_payload will be set and the +faulting address (or the new DR6 bits*) will be reported in the +exception_payload field. Similarly, when userspace injects a #PF (or +#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set +exception.has_payload and to put the faulting address (or the new DR6 +bits*) in the exception_payload field. + +This capability also enables exception.pending in struct +kvm_vcpu_events, which allows userspace to distinguish between pending +and injected exceptions. + + +* For the new DR6 bits, note that bit 16 is set iff the #DB exception + will clear DR6.RTM. + 8. Other capabilities. ---------------------- @@ -4791,3 +4888,10 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using AArch64, this value will be reported in the ISS field of ESR_ELx. See KVM_CAP_VCPU_EVENTS for more details. +8.20 KVM_CAP_HYPERV_SEND_IPI + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V IPI send +hypercalls: +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. 
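For the coalesced I/O interface described in the 4.116 hunk above, registration is a single VM ioctl taking a struct kvm_coalesced_mmio_zone. A rough sketch of the userspace side, assuming vm_fd is an existing VM descriptor; the helper name, the mmio_base parameter and the one-page size are made-up example values:

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Queue guest writes to one page of (hypothetical) device registers. */
  static int coalesce_device_page(int vm_fd, __u64 mmio_base)
  {
          struct kvm_coalesced_mmio_zone zone;

          memset(&zone, 0, sizeof(zone));
          zone.addr = mmio_base;  /* e.g. a frequently written doorbell page */
          zone.size = 0x1000;
          /*
           * For coalesced pio (KVM_CAP_COALESCED_PIO), the zone instead
           * describes an I/O port range; see the uapi header for the
           * exact field layout.
           */

          return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
  }

As the documentation above notes, userspace is still expected to drain the shared ring buffer before emulating the access that finally causes an exit.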
diff --git a/MAINTAINERS b/MAINTAINERS index 1610fb26bdac..86e019c7b0fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12260,6 +12260,7 @@ F: Documentation/networking/rds.txt RDT - RESOURCE ALLOCATION M: Fenghua Yu <[email protected]> +M: Reinette Chatre <[email protected]> S: Supported F: arch/x86/kernel/cpu/intel_rdt* @@ -15924,6 +15925,7 @@ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner <[email protected]> M: Ingo Molnar <[email protected]> +M: Borislav Petkov <[email protected]> R: "H. Peter Anvin" <[email protected]> @@ -15952,6 +15954,15 @@ M: Borislav Petkov <[email protected]> S: Maintained F: arch/x86/kernel/cpu/microcode/* +X86 MM +M: Dave Hansen <[email protected]> +M: Andy Lutomirski <[email protected]> +M: Peter Zijlstra <[email protected]> +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm +S: Maintained +F: arch/x86/mm/ + X86 PLATFORM DRIVERS M: Darren Hart <[email protected]> M: Andy Shevchenko <[email protected]> @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Merciless Moray # *DOCUMENTATION* diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 3ab8b3781bfe..c3f1f9b304b7 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -133,8 +133,7 @@ * space. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) /* Virtualization Translation Control Register (VTCR) bits */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3ad482d2f1eb..5ca5d9af0c26 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } @@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + /* + * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, + * so any non-zero value used as type is illegal. + */ + if (type) + return -EINVAL; + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 265ea9cf7df7..5ad1a54f98dc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -35,16 +35,12 @@ addr; \ }) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. 
- */ -#define KVM_MMU_CACHE_MIN_PAGES 2 - #ifndef __ASSEMBLY__ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> #include <asm/pgalloc.h> #include <asm/stage2_pgtable.h> @@ -52,6 +48,13 @@ /* Ensure compatibility with arm64 */ #define VA_BITS 32 +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK + +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) + int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, @@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) (addr) +static inline void kvm_set_ipa_limit(void) {} + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 460d616bb2d6..f6a7ea805232 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -19,43 +19,53 @@ #ifndef __ARM_S2_PGTABLE_H_ #define __ARM_S2_PGTABLE_H_ -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) - -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) - -#define stage2_pud_huge(pud) pud_huge(pud) +/* + * kvm_mmu_cache_min_pages() is the number of pages required + * to install a stage-2 translation. We pre-allocate the entry + * level table at VM creation. Since we have a 3 level page-table, + * we need only two pages to add a new mapping. + */ +#define kvm_mmu_cache_min_pages(kvm) 2 + +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) + +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(kvm, pud) pud_huge(pud) /* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; return (boundary - 1 < end - 1) ? 
boundary : end; } -#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_addr_end(kvm, addr, end) (end) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pgd_index(addr) pgd_index(addr) +#define stage2_pgd_index(kvm, addr) pgd_index(addr) -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(pudp) false +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(kvm, pudp) false #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 1717ba1db35d..072cc1c970c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -530,6 +530,26 @@ void arm64_set_ssbd_mitigation(bool state); static inline void arm64_set_ssbd_mitigation(bool state) {} #endif +static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +{ + switch (parange) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: return 48; + case 6: return 52; + /* + * A future PE could use a value unknown to the kernel. + * However, by the "D10.1.4 Principles of the ID scheme + * for fields in ID registers", ARM DDI 0487C.a, any new + * value is guaranteed to be higher than what we know already. + * As a safe limit, we return the limit supported by the kernel. + */ + default: return CONFIG_ARM64_PA_BITS; + } +} #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index aa45df752a16..6e324d1f1231 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ #define VTCR_EL2_RES1 (1 << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K @@ -120,62 +121,149 @@ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA #define VTCR_EL2_SL0_SHIFT 6 #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) -#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) +#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) + /* * We configure the Stage-2 page tables to always restrict the IPA space to be * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time - * (see hyp-init.S). + * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. * - * The magic numbers used for VTTBR_X in this patch can be found in Tables - * D4-23 and D4-25 in ARM DDI 0487A.b. 
*/ -#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) -#ifdef CONFIG_ARM64_64K_PAGES /* - * Stage2 translation configuration: - * 64kB pages (TG0 = 1) - * 2 level page tables (SL = 1) + * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. + * Interestingly, it depends on the page size. + * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a + * + * ----------------------------------------- + * | Entry level | 4K | 16K/64K | + * ------------------------------------------ + * | Level: 0 | 2 | - | + * ------------------------------------------ + * | Level: 1 | 1 | 2 | + * ------------------------------------------ + * | Level: 2 | 0 | 1 | + * ------------------------------------------ + * | Level: 3 | - | 0 | + * ------------------------------------------ + * + * The table roughly translates to : + * + * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level + * + * Where TGRAN_SL0_BASE is a magic number depending on the page size: + * TGRAN_SL0_BASE(4K) = 2 + * TGRAN_SL0_BASE(16K) = 3 + * TGRAN_SL0_BASE(64K) = 3 + * provided we take care of ruling out the unsupported cases and + * Entry_Level = 4 - Number_of_levels. + * */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 38 +#ifdef CONFIG_ARM64_64K_PAGES + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #elif defined(CONFIG_ARM64_16K_PAGES) -/* - * Stage2 translation configuration: - * 16kB pages (TG0 = 2) - * 2 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 42 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #else /* 4K */ -/* - * Stage2 translation configuration: - * 4kB pages (TG0 = 0) - * 3 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 37 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K +#define VTCR_EL2_TGRAN_SL0_BASE 2UL + #endif -#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) -#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) +#define VTCR_EL2_LVLS_TO_SL0(levels) \ + ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_TO_LVLS(sl0) \ + ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) +#define VTCR_EL2_LVLS(vtcr) \ + VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) +#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) + +/* + * ARM VMSAv8-64 defines an algorithm for finding the translation table + * descriptors in section D4.2.8 in ARM DDI 0487C.a. + * + * The algorithm defines the expectations on the translation table + * addresses for each level, based on PAGE_SIZE, entry level + * and the translation table size (T0SZ). The variable "x" in the + * algorithm determines the alignment of a table base address at a given + * level and thus determines the alignment of VTTBR:BADDR for stage2 + * page table entry level. + * Since the number of bits resolved at the entry level could vary + * depending on the T0SZ, the value of "x" is defined based on a + * Magic constant for a given PAGE_SIZE and Entry Level. The + * intermediate levels must be always aligned to the PAGE_SIZE (i.e, + * x = PAGE_SHIFT). 
+ * + * The value of "x" for entry level is calculated as : + * x = Magic_N - T0SZ + * + * where Magic_N is an integer depending on the page size and the entry + * level of the page table as below: + * + * -------------------------------------------- + * | Entry level | 4K 16K 64K | + * -------------------------------------------- + * | Level: 0 (4 levels) | 28 | - | - | + * -------------------------------------------- + * | Level: 1 (3 levels) | 37 | 31 | 25 | + * -------------------------------------------- + * | Level: 2 (2 levels) | 46 | 42 | 38 | + * -------------------------------------------- + * | Level: 3 (1 level) | - | 53 | 51 | + * -------------------------------------------- + * + * We have a magic formula for the Magic_N below: + * + * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) + * + * where Number_of_levels = (4 - Level). We are only interested in the + * value for Entry_Level for the stage2 page table. + * + * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: + * + * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) + * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) + * + * Here is one way to explain the Magic Formula: + * + * x = log2(Size_of_Entry_Level_Table) + * + * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another + * PAGE_SHIFT bits in the PTE, we have : + * + * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 + * where n = number of levels, and since each pointer is 8bytes, we have: + * + * x = Bits_Entry_Level + 3 + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n + * + * The only constraint here is that, we have to find the number of page table + * levels for a given IPA size (which we do, see stage2_pt_levels()) + */ +#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) -#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X) #define VTTBR_VMID_SHIFT (UL(48)) #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) @@ -223,6 +311,13 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +/* + * We have + * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] + * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + */ +#define PAR_TO_HPFAR(par) \ + (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) #define kvm_arm_exception_type \ {0, "IRQ" }, \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 102b5a5c47b6..aea01a09eb94 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -30,6 +30,7 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 +#define ARM_EXCEPTION_IL 3 /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR @@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -extern u32 __init_stage2_translation(void); - /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ #define __hyp_this_cpu_ptr(sym) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d6d7336f871..f84052f306af 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int 
kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { @@ -61,11 +61,13 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table, protected by kvm->mmu_lock */ + /* stage2 entry level table */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* VTCR_EL2 value for this VM */ + u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -440,13 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) -{ - u32 parange = kvm_call_hyp(__init_stage2_translation); - - WARN_ONCE(parange < 40, - "PARange is %d bits, unsupported configuration!", parange); -} +static inline void __cpu_init_stage2(void) {} /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); @@ -509,8 +505,12 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); +void kvm_set_ipa_limit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 384c34397619..23aca66767f9 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vtcr, vtcr_el2); + write_sysreg(kvm->arch.vttbr, vttbr_el2); +} + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index d6fff7de5539..77b1af9e64db 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v) * We currently only support a 40bit IPA. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) + +#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) +#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) + +static inline bool kvm_page_empty(void *ptr) +{ + struct page *ptr_page = virt_to_page(ptr); + return page_count(ptr_page) == 1; +} #include <asm/stage2_pgtable.h> @@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED @@ -517,5 +519,29 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. 
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with + * 52bit IPS. + */ +static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) +{ + int x = ARM64_VTTBR_X(ipa_shift, levels); + + return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; +} + +static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) +{ + unsigned int x = arm64_vttbr_x(ipa_shift, levels); + + return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); +} + +static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) +{ + return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 177b851ca6d9..ff35ac1258eb 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -25,6 +25,9 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +/* Additional SPSR bits not exposed in the UABI */ +#define PSR_IL_BIT (1 << 20) + /* AArch32-specific ptrace requests */ #define COMPAT_PTRACE_GETREGS 12 #define COMPAT_PTRACE_SETREGS 13 diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h deleted file mode 100644 index 2656a0fd05a6..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ -#define __ARM64_S2_PGTABLE_NOPMD_H_ - -#include <asm/stage2_pgtable-nopud.h> - -#define __S2_PGTABLE_PMD_FOLDED - -#define S2_PMD_SHIFT S2_PUD_SHIFT -#define S2_PTRS_PER_PMD 1 -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) - -#define stage2_pud_none(pud) (0) -#define stage2_pud_present(pud) (1) -#define stage2_pud_clear(pud) do { } while (0) -#define stage2_pud_populate(pud, pmd) do { } while (0) -#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) - -#define stage2_pmd_free(pmd) do { } while (0) - -#define stage2_pmd_addr_end(addr, end) (end) - -#define stage2_pud_huge(pud) (0) -#define stage2_pmd_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h deleted file mode 100644 index 5ee87b54ebf3..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. 
If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ -#define __ARM64_S2_PGTABLE_NOPUD_H_ - -#define __S2_PGTABLE_PUD_FOLDED - -#define S2_PUD_SHIFT S2_PGDIR_SHIFT -#define S2_PTRS_PER_PUD 1 -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) - -#define stage2_pgd_none(pgd) (0) -#define stage2_pgd_present(pgd) (1) -#define stage2_pgd_clear(pgd) do { } while (0) -#define stage2_pgd_populate(pgd, pud) do { } while (0) - -#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) - -#define stage2_pud_free(x) do { } while (0) - -#define stage2_pud_addr_end(addr, end) (end) -#define stage2_pud_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 8b68099348e5..d352f6df8d2c 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -19,9 +19,17 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ +#include <linux/hugetlb.h> #include <asm/pgtable.h> /* + * PGDIR_SHIFT determines the size a top-level page table entry can map + * and depends on the number of levels in the page table. Compute the + * PGDIR_SHIFT for a given number of levels. + */ +#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) + +/* * The hardware supports concatenation of up to 16 tables at stage2 entry level * and we use the feature whenever possible. * @@ -29,112 +37,208 @@ * On arm64, the smallest PAGE_SIZE supported is 4k, which means * (PAGE_SHIFT - 3) > 4 holds for all page sizes. * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) * in normal translations(e.g, stage1), since we cannot have another level in - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + * the range (IPA_SHIFT, IPA_SHIFT - 4). */ -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) +#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) +#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* - * With all the supported VA_BITs and 40bit guest IPA, the following condition - * is always true: - * - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS - * - * We base our stage-2 page table walker helpers on this assumption and - * fall back to using the host version of the helper wherever possible. - * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back - * to using the host version, since it is guaranteed it is not folded at host. - * - * If the condition breaks in the future, we can rearrange the host level - * definitions and reuse them for stage2. Till then... - */ -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS -#error "Unsupported combination of guest IPA and host VA_BITS." 
-#endif - -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) +/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ +#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) +#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) +#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* * The number of PTRS across all concatenated stage2 tables given by the * number of bits resolved at the initial level. + * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), + * in which case, stage2_pgd_ptrs will have one entry. */ -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) +#define pgd_ptrs_shift(ipa, pgdir_shift) \ + ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) +#define __s2_pgd_ptrs(ipa, lvls) \ + (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) +#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) + +#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) /* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD. + * kvm_mmmu_cache_min_pages() is the number of pages required to install + * a stage-2 translation. We pre-allocate the entry level page table at + * the VM creation. */ -#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) - -#if STAGE2_PGTABLE_LEVELS > 3 +/* Stage2 PUD definitions when the level is present */ +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); +} #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) +static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +{ + if (kvm_stage2_has_pud(kvm)) + return pgd_none(pgd); + else + return 0; +} -#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_clear(pgdp); +} -static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + if (kvm_stage2_has_pud(kvm)) + return pgd_present(pgd); + else + return 1; +} - return (boundary - 1 < end - 1) ? 
boundary : end; +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_populate(NULL, pgd, pud); +} + +static inline pud_t *stage2_pud_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + if (kvm_stage2_has_pud(kvm)) + return pud_offset(pgd, address); + else + return (pud_t *)pgd; } -#endif /* STAGE2_PGTABLE_LEVELS > 3 */ +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pud_free(NULL, pud); +} +static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) +{ + if (kvm_stage2_has_pud(kvm)) + return kvm_page_empty(pudp); + else + return false; +} -#if STAGE2_PGTABLE_LEVELS > 2 +static inline phys_addr_t +stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pud(kvm)) { + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} + +/* Stage2 PMD definitions when the level is present */ +static inline bool kvm_stage2_has_pmd(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); +} #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_none(pud); + else + return 0; +} + +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pmd(kvm)) + pud_clear(pud); +} -#define stage2_pud_huge(pud) pud_huge(pud) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_present(pud); + else + return 1; +} -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + if (kvm_stage2_has_pmd(kvm)) + pud_populate(NULL, pud, pmd); +} - return (boundary - 1 < end - 1) ? 
boundary : end; +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, + pud_t *pud, unsigned long address) +{ + if (kvm_stage2_has_pmd(kvm)) + return pmd_offset(pud, address); + else + return (pmd_t *)pud; } -#endif /* STAGE2_PGTABLE_LEVELS > 2 */ +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) +{ + if (kvm_stage2_has_pmd(kvm)) + pmd_free(NULL, pmd); +} + +static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_huge(pud); + else + return 0; +} + +static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) +{ + if (kvm_stage2_has_pmd(kvm)) + return kvm_page_empty(pmdp); + else + return 0; +} -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pmd(kvm)) { + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; -#if STAGE2_PGTABLE_LEVELS == 2 -#include <asm/stage2_pgtable-nopmd.h> -#elif STAGE2_PGTABLE_LEVELS == 3 -#include <asm/stage2_pgtable-nopud.h> -#endif + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} +static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) +{ + return kvm_page_empty(ptep); +} -#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) +{ + return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); +} -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); return (boundary - 1 < end - 1) ? boundary : end; } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 07256b08226c..a74f84d09412 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -338,15 +338,15 @@ int __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; - }; + } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_POTENZA: return KVM_ARM_TARGET_XGENE_POTENZA; - }; + } break; - }; + } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e5e741bfffe1..35a81bebd02b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; + case ARM_EXCEPTION_IL: + /* + * We attempted an illegal exception return. Guest state must + * have been corrupted somehow. Give up. 
+ */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 2fabc2dc1966..82d1904328ad 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o -obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 24b4fbafe3e4..b1f14f736962 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -162,6 +162,20 @@ el1_error: mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_sync: + /* Check for illegal exception return, otherwise panic */ + mrs x0, spsr_el2 + + /* if this was something else, then panic! */ + tst x0, #PSR_IL_BIT + b.eq __hyp_panic + + /* Let's attempt a recovery from the illegal exception return */ + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IL + b __guest_exit + + el2_error: ldp x0, x1, [sp], #16 @@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector) invalid_vect el2t_fiq_invalid // FIQ EL2t invalid_vect el2t_error_invalid // Error EL2t - invalid_vect el2h_sync_invalid // Synchronous EL2h + valid_vect el2_sync // Synchronous EL2h invalid_vect el2h_irq_invalid // IRQ EL2h invalid_vect el2h_fiq_invalid // FIQ EL2h valid_vect el2_error // Error EL2h diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c deleted file mode 100644 index 603e1ee83e89..000000000000 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier <[email protected]> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/types.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_hyp.h> - -u32 __hyp_text __init_stage2_translation(void) -{ - u64 val = VTCR_EL2_FLAGS; - u64 parange; - u64 tmp; - - /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS - * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while - * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... - */ - parange = read_sysreg(id_aa64mmfr0_el1) & 7; - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; - val |= parange << 16; - - /* Compute the actual PARange... */ - switch (parange) { - case 0: - parange = 32; - break; - case 1: - parange = 36; - break; - case 2: - parange = 40; - break; - case 3: - parange = 42; - break; - case 4: - parange = 44; - break; - case 5: - default: - parange = 48; - break; - } - - /* - * ... and clamp it to 40 bits, unless we have some braindead - * HW that implements less than that. 
In all cases, we'll - * return that value for the rest of the kernel to decide what - * to do. - */ - val |= 64 - (parange > 40 ? 40 : parange); - - /* - * Check the availability of Hardware Access Flag / Dirty Bit - * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (tmp) - val |= VTCR_EL2_HA; - - /* - * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS - * bit in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; - val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; - - write_sysreg(val, vtcr_el2); - - return parange; -} diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ca46153d7915..7cc175c88a37 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void) static void __hyp_text __activate_vm(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); } static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) @@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) return false; /* Translation failed, back to guest */ /* Convert PAR to HPFAR format */ - *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4; + *hpfar = PAR_TO_HPFAR(tmp); return true; } diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 9ce223944983..8dc285318204 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { + u64 pstate = ctxt->gp_regs.regs.pstate; + u64 mode = pstate & PSR_AA32_MODE_MASK; + + /* + * Safety check to ensure we're setting the CPU up to enter the guest + * in a less privileged mode. + * + * If we are attempting a return to EL2 or higher in AArch64 state, + * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that + * we'll take an illegal exception state exception immediately after + * the ERET to the guest. Attempts to return to AArch32 Hyp will + * result in an illegal exception return because EL2's execution state + * is determined by SCR_EL3.RW. + */ + if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) + pstate = PSR_MODE_EL2h | PSR_IL_BIT; + write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); + write_sysreg_el2(pstate, spsr); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 131c7772703c..4dbd9c69a96d 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so * let's flip TGE before executing the TLB operation. 
*/ - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); @@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); isb(); } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e37c78bbe1ca..b72a3dd56204 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -26,6 +26,7 @@ #include <kvm/arm_arch_timer.h> +#include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> @@ -33,6 +34,9 @@ #include <asm/kvm_coproc.h> #include <asm/kvm_mmu.h> +/* Maximum phys_shift supported for any VM on this host */ +static u32 kvm_ipa_limit; + /* * ARMv8 Reset Values */ @@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void) } /** - * kvm_arch_dev_ioctl_check_extension + * kvm_arch_vm_ioctl_check_extension * * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: - case KVM_CAP_VCPU_EVENTS: r = 1; break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_ipa_limit; + break; default: r = 0; } @@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu); } + +void kvm_set_ipa_limit(void) +{ + unsigned int ipa_max, pa_max, va_max, parange; + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; + pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + + /* Clamp the IPA limit to the PA size supported by the kernel */ + ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; + /* + * Since our stage2 table is dependent on the stage1 page table code, + * we must always honor the following condition: + * + * Number of levels in Stage1 >= Number of levels in Stage2. + * + * So clamp the ipa limit further down to limit the number of levels. + * Since we can concatenate upto 16 tables at entry level, we could + * go upto 4bits above the maximum VA addressible with the current + * number of levels. + */ + va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; + va_max += 4; + + if (va_max < ipa_max) + ipa_max = va_max; + + /* + * If the final limit is lower than the real physical address + * limit of the CPUs, report the reason. + */ + if (ipa_max < pa_max) + pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", + (va_max < pa_max) ? "Virtual" : "Physical"); + + WARN(ipa_max < KVM_PHYS_SHIFT, + "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); + kvm_ipa_limit = ipa_max; + kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); +} + +/* + * Configure the VTCR_EL2 for this VM. The VTCR value is common + * across all the physical CPUs on the system. We use system wide + * sanitised values to fill in different fields, except for Hardware + * Management of Access Flags. HA Flag is set unconditionally on + * all CPUs, as it is safe to run with or without the feature and + * the bit is RES0 on CPUs that don't support it. 
+ */ +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + u64 vtcr = VTCR_EL2_FLAGS; + u32 parange, phys_shift; + u8 lvls; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < 32) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + } + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; + vtcr |= parange << VTCR_EL2_PS_SHIFT; + + vtcr |= VTCR_EL2_T0SZ(phys_shift); + /* + * Use a minimum 2 level page table to prevent splitting + * host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); + + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. The features is RES0 on CPUs without the support + * and must be ignored by the CPUs. + */ + vtcr |= VTCR_EL2_HA; + + /* Set the vmid bits */ + vtcr |= (kvm_get_vmid_bits() == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + kvm->arch.vtcr = vtcr; + return 0; +} diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..6390bd8c141b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/acpi.h> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 09b2e3e2cf1b..55e51ff7e421 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -102,7 +102,15 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 3 +enum { + PT_PAGE_TABLE_LEVEL = 1, + PT_DIRECTORY_LEVEL = 2, + PT_PDPE_LEVEL = 3, + /* set max level to the biggest one */ + PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL, +}; +#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \ + PT_PAGE_TABLE_LEVEL + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) @@ -177,6 +185,7 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) +#define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) #define DR6_FIXED_1 0xfffe0ff0 #define DR6_INIT 0xffff0ff0 @@ -247,7 +256,7 @@ struct kvm_mmu_memory_cache { * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. */ union kvm_mmu_page_role { - unsigned word; + u32 word; struct { unsigned level:4; unsigned cr4_pae:1; @@ -273,6 +282,34 @@ union kvm_mmu_page_role { }; }; +union kvm_mmu_extended_role { +/* + * This structure complements kvm_mmu_page_role caching everything needed for + * MMU configuration. If nothing in both these structures changed, MMU + * re-configuration can be skipped. @valid bit is set on first usage so we don't + * treat all-zero structure as valid data. 
+ */ + u32 word; + struct { + unsigned int valid:1; + unsigned int execonly:1; + unsigned int cr0_pg:1; + unsigned int cr4_pse:1; + unsigned int cr4_pke:1; + unsigned int cr4_smap:1; + unsigned int cr4_smep:1; + unsigned int cr4_la57:1; + }; +}; + +union kvm_mmu_role { + u64 as_u64; + struct { + union kvm_mmu_page_role base; + union kvm_mmu_extended_role ext; + }; +}; + struct kvm_rmap_head { unsigned long val; }; @@ -280,18 +317,18 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + bool unsync; /* * The following two entries are used to key the shadow page in the * hash table. */ - gfn_t gfn; union kvm_mmu_page_role role; + gfn_t gfn; u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ @@ -360,7 +397,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - union kvm_mmu_page_role base_role; + union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; u8 ept_ad; @@ -490,7 +527,7 @@ struct kvm_vcpu_hv { struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - cpumask_t tlb_lush; + cpumask_t tlb_flush; }; struct kvm_vcpu_arch { @@ -534,7 +571,13 @@ struct kvm_vcpu_arch { * the paging mode of the l1 guest. This context is always used to * handle faults. */ - struct kvm_mmu mmu; + struct kvm_mmu *mmu; + + /* Non-nested MMU for L1 */ + struct kvm_mmu root_mmu; + + /* L1 MMU when running nested */ + struct kvm_mmu guest_mmu; /* * Paging state of an L2 guest (used for nested npt) @@ -585,6 +628,8 @@ struct kvm_vcpu_arch { bool has_error_code; u8 nr; u32 error_code; + unsigned long payload; + bool has_payload; u8 nested_apf; } exception; @@ -781,6 +826,9 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + + /* How many vCPUs have VP index != vCPU index */ + atomic_t num_mismatched_vp_indexes; }; enum kvm_irqchip_mode { @@ -871,6 +919,7 @@ struct kvm_arch { bool x2apic_broadcast_quirk_disabled; bool guest_can_read_msr_platform_info; + bool exception_payload_enabled; }; struct kvm_vm_stat { @@ -1133,6 +1182,9 @@ struct kvm_x86_ops { int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*get_msr_feature)(struct kvm_msr_entry *entry); + + int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); }; struct kvm_arch_async_pf { @@ -1170,7 +1222,6 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, @@ -1324,7 +1375,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, diff --git 
a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ce2b59047cb8..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> +#include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 0116b2ee9e64..449c92da2c91 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void) */ static inline void cpu_vmxoff(void) { - asm volatile (ASM_VMX_VMXOFF : : : "cc"); + asm volatile ("vmxoff"); cr4_clear_bits(X86_CR4_VMXE); } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9527ba5d62da..ade0f153947d 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -503,19 +503,6 @@ enum vmcs_field { #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul - -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" -#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" -#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - struct vmx_msr_entry { u32 index; u32 reserved; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index fd23d5778ea1..dabfcf7c3941 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -288,6 +288,7 @@ struct kvm_reinject_control { #define 
KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -299,7 +300,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + __u8 pending; __u32 error_code; } exception; struct { @@ -322,7 +323,9 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u32 reserved[9]; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; /* for KVM_GET/SET_DEBUGREGS */ @@ -381,6 +384,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..285eb3ec4200 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..0f53049719cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read 
one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..1b8e86a5d5e1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) @@ -802,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); @@ -989,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && @@ -1024,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1085,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } @@ -1155,8 +1172,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1174,6 +1191,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } 
for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1199,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; @@ -2336,12 +2359,18 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2373,6 +2402,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..ddee1f0870c4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include <asm/bootparam_utils.h> #include <asm/microcode.h> #include <asm/kasan.h> +#include <asm/fixmap.h> /* * Manage page tables very early on. @@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These @@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr, sme_encrypt_kernel(bp); /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. 
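For the .bss..decrypted handling just above, a rough userspace sketch of the address arithmetic: walking a PMD-aligned virtual range in 2 MB steps and clearing an encryption bit in the corresponding 512-entry PMD table. The constants, the fake table contents and the C-bit position are illustrative assumptions (the real mask comes from sme_get_me_mask() and the range from the __start/__end_bss_decrypted linker symbols); the kernel subtracts the mask rather than masking it off because the bit is known to be set.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT       21                      /* 2 MB per PMD entry on x86-64 */
#define PMD_SIZE        (UINT64_C(1) << PMD_SHIFT)
#define PTRS_PER_PMD    512
#define SME_MASK        (UINT64_C(1) << 47)     /* illustrative C-bit; real one comes from CPUID */

static unsigned int pmd_index(uint64_t vaddr)
{
        return (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

int main(void)
{
        static uint64_t pmd[PTRS_PER_PMD];
        uint64_t start = UINT64_C(0xffffffff82800000); /* stand-in for __start_bss_decrypted */
        uint64_t end   = start + 2 * PMD_SIZE;         /* stand-in for __end_bss_decrypted */
        uint64_t vaddr;

        /* Pretend the early page tables mapped the range with the C-bit set. */
        for (vaddr = start; vaddr < end; vaddr += PMD_SIZE)
                pmd[pmd_index(vaddr)] = (vaddr - start) | SME_MASK;     /* fake phys addr + C-bit */

        /* Clear the encryption mask, one entry per 2 MB step, as in __startup_64(). */
        for (vaddr = start; vaddr < end; vaddr += PMD_SIZE)
                pmd[pmd_index(vaddr)] &= ~SME_MASK;

        for (vaddr = start; vaddr < end; vaddr += PMD_SIZE)
                printf("pmd[%u] = %#" PRIx64 "\n", pmd_index(vaddr), pmd[pmd_index(vaddr)]);
        return 0;
}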
*/ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..a3618cf04cf6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,6 +24,7 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/fixmap.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..013fe3d21dbb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <linux/sched/clock.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/set_memory.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 
0 : -ENOMEM; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..8dc69d82567e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..5dd3317d761f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -355,6 +373,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . 
= ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 01d209ab5481..4e80080f277a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -36,6 +36,8 @@ #include "trace.h" +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) + static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); @@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) struct kvm_vcpu *vcpu = NULL; int i; - if (vpidx < KVM_MAX_VCPUS) - vcpu = kvm_get_vcpu(kvm, vpidx); + if (vpidx >= KVM_MAX_VCPUS) + return NULL; + + vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) stimer_cleanup(&hv_vcpu->stimer[i]); } +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + return false; + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} +EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); + +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page) +{ + if (!kvm_hv_assist_page_enabled(vcpu)) + return false; + return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + assist_page, sizeof(*assist_page)); +} +EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); + static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; @@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: - if (!host) + case HV_X64_MSR_VP_INDEX: { + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + int vcpu_idx = kvm_vcpu_get_idx(vcpu); + u32 new_vp_index = (u32)data; + + if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; - hv->vp_index = (u32)data; + + if (new_vp_index == hv_vcpu->vp_index) + return 0; + + /* + * The VP index is initialized to vcpu_index by + * kvm_hv_vcpu_postcreate so they initially match. Now the + * VP index is changing, adjust num_mismatched_vp_indexes if + * it now matches or no longer matches vcpu_idx. + */ + if (hv_vcpu->vp_index == vcpu_idx) + atomic_inc(&hv->num_mismatched_vp_indexes); + else if (new_vp_index == vcpu_idx) + atomic_dec(&hv->num_mismatched_vp_indexes); + + hv_vcpu->vp_index = new_vp_index; break; + } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { - hv->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + hv_vcpu->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) + + /* + * Clear apic_assist portion of f(struct hv_vp_assist_page + * only, there can be valuable data in the rest which needs + * to be preserved e.g. on migration. 
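The HV_X64_MSR_VP_INDEX handler above maintains num_mismatched_vp_indexes so that later code can assume vp_index == vcpu_idx for every vCPU whenever the counter reads zero. A small sketch of that bookkeeping invariant, with plain ints in place of the kernel's atomic_t and hypothetical helper names:

#include <assert.h>
#include <stdio.h>

#define NR_VCPUS 4

static unsigned int vp_index[NR_VCPUS];
static int num_mismatched;      /* how many vCPUs have vp_index != vcpu_idx */

static void init_vcpus(void)
{
        /* Like kvm_hv_vcpu_postcreate(): vp_index starts equal to vcpu_idx. */
        for (unsigned int i = 0; i < NR_VCPUS; i++)
                vp_index[i] = i;
        num_mismatched = 0;
}

static int set_vp_index(unsigned int vcpu_idx, unsigned int new_vp_index)
{
        if (new_vp_index >= NR_VCPUS)
                return -1;
        if (new_vp_index == vp_index[vcpu_idx])
                return 0;

        /* Leaving the "matching" state increments, re-entering it decrements. */
        if (vp_index[vcpu_idx] == vcpu_idx)
                num_mismatched++;
        else if (new_vp_index == vcpu_idx)
                num_mismatched--;

        vp_index[vcpu_idx] = new_vp_index;
        return 0;
}

int main(void)
{
        init_vcpus();
        set_vp_index(1, 3);             /* one mismatch */
        set_vp_index(3, 1);             /* two mismatches */
        set_vp_index(1, 1);             /* back to one mismatch */
        printf("mismatched vp indexes: %d\n", num_mismatched);
        assert(num_mismatched == 1);
        return 0;
}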
+ */ + if (__clear_user((void __user *)addr, sizeof(u32))) return 1; - hv->hv_vapic = data; + hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + gfn_to_gpa(gfn) | KVM_MSR_ENABLED, + sizeof(struct hv_vp_assist_page))) return 1; break; } @@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; - hv->runtime_offset = data - current_task_runtime_100ns(); + hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { case HV_X64_MSR_VP_INDEX: - data = hv->vp_index; + data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); @@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: - data = hv->hv_vapic; + data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: - data = current_task_runtime_100ns() + hv->runtime_offset; + data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) +static __always_inline unsigned long *sparse_set_to_vcpu_mask( + struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, + u64 *vp_bitmap, unsigned long *vcpu_bitmap) { - int i = 0, j; + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + int i, bank, sbank = 0; - if (!(valid_bank_mask & BIT_ULL(bank_no))) - return -1; + memset(vp_bitmap, 0, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS) + vp_bitmap[bank] = sparse_banks[sbank++]; - for (j = 0; j < bank_no; j++) - if (valid_bank_mask & BIT_ULL(j)) - i++; + if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { + /* for all vcpus vp_index == vcpu_idx */ + return (unsigned long *)vp_bitmap; + } - return i; + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, + (unsigned long *)vp_bitmap)) + __set_bit(i, vcpu_bitmap); + } + return vcpu_bitmap; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, u16 rep_cnt, bool ex) { struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_current = ¤t_vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = ¤t_vcpu->arch.hyperv; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - struct kvm_vcpu *vcpu; - unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0}; - unsigned long valid_bank_mask = 0; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + u64 valid_bank_mask; u64 sparse_banks[64]; - int sparse_banks_len, i; + int sparse_banks_len; bool all_cpus; if (!ex) { @@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, 
trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags); + valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; } else { @@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sparse_banks_len = + bitmap_weight((unsigned long *)&valid_bank_mask, 64) * sizeof(sparse_banks[0]); if (!sparse_banks_len && !all_cpus) @@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, return HV_STATUS_INVALID_HYPERCALL_INPUT; } - cpumask_clear(&hv_current->tlb_lush); - - kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - int bank = hv->vp_index / 64, sbank = 0; - - if (!all_cpus) { - /* Banks >64 can't be represented */ - if (bank >= 64) - continue; - - /* Non-ex hypercalls can only address first 64 vCPUs */ - if (!ex && bank) - continue; - - if (ex) { - /* - * Check is the bank of this vCPU is in sparse - * set and get the sparse bank number. - */ - sbank = get_sparse_bank_no(valid_bank_mask, - bank); - - if (sbank < 0) - continue; - } - - if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64))) - continue; - } + cpumask_clear(&hv_vcpu->tlb_flush); - /* - * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we - * can't analyze it here, flush TLB regardless of the specified - * address space. - */ - __set_bit(i, vcpu_bitmap); - } + vcpu_mask = all_cpus ? NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + /* + * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't + * analyze it here, flush TLB regardless of the specified address space. + */ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_bitmap, &hv_current->tlb_lush); + vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. 
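sparse_set_to_vcpu_mask() above first unpacks the hypercall's sparse representation, a 64-bit valid_bank_mask plus one 64-bit word per set bank, into a flat per-VP bitmap. A standalone sketch of that unpacking step, with a small illustrative bank count in place of KVM_MAX_VCPUS / 64:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_BANKS 4                     /* illustrative; KVM uses KVM_MAX_VCPUS / 64 */

/*
 * Expand {valid_bank_mask, sparse_banks[]} into vp_bitmap[]: bank i of the
 * flat bitmap receives the next packed word iff bit i of valid_bank_mask is set.
 */
static void sparse_to_flat(uint64_t valid_bank_mask,
                           const uint64_t *sparse_banks,
                           uint64_t vp_bitmap[MAX_BANKS])
{
        int sbank = 0;

        memset(vp_bitmap, 0, MAX_BANKS * sizeof(vp_bitmap[0]));
        for (int bank = 0; bank < MAX_BANKS; bank++) {
                if (valid_bank_mask & (1ULL << bank))
                        vp_bitmap[bank] = sparse_banks[sbank++];
        }
}

int main(void)
{
        /* Banks 0 and 2 are present: VPs 0-63 and VPs 128-191. */
        uint64_t valid_bank_mask = (1ULL << 0) | (1ULL << 2);
        uint64_t sparse_banks[] = { 0x3, 0x80 };        /* VPs 0,1 and VP 135 */
        uint64_t vp_bitmap[MAX_BANKS];

        sparse_to_flat(valid_bank_mask, sparse_banks, vp_bitmap);
        for (int bank = 0; bank < MAX_BANKS; bank++)
                printf("bank %d: %#llx\n", bank, (unsigned long long)vp_bitmap[bank]);
        return 0;
}

When num_mismatched_vp_indexes is zero the flat bitmap doubles as the vCPU mask; otherwise each vCPU's vp_index is looked up to build vcpu_bitmap, as in the second half of the kernel helper.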
*/ @@ -1370,6 +1407,99 @@ ret_success: ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } +static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vector + }; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + continue; + + /* We fail only when APIC is disabled */ + kvm_apic_set_irq(vcpu, &irq, NULL); + } +} + +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, + bool ex, bool fast) +{ + struct kvm *kvm = current_vcpu->kvm; + struct hv_send_ipi_ex send_ipi_ex; + struct hv_send_ipi send_ipi; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + unsigned long valid_bank_mask; + u64 sparse_banks[64]; + int sparse_banks_len; + u32 vector; + bool all_cpus; + + if (!ex) { + if (!fast) { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + sizeof(send_ipi)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = send_ipi.cpu_mask; + vector = send_ipi.vector; + } else { + /* 'reserved' part of hv_send_ipi should be 0 */ + if (unlikely(ingpa >> 32 != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = outgpa; + vector = (u32)ingpa; + } + all_cpus = false; + valid_bank_mask = BIT_ULL(0); + + trace_kvm_hv_send_ipi(vector, sparse_banks[0]); + } else { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, + sizeof(send_ipi_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, + send_ipi_ex.vp_set.format, + send_ipi_ex.vp_set.valid_bank_mask); + + vector = send_ipi_ex.vector; + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sizeof(sparse_banks[0]); + + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + + if (!sparse_banks_len) + goto ret_success; + + if (!all_cpus && + kvm_read_guest(kvm, + ingpa + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents), + sparse_banks, + sparse_banks_len)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + vcpu_mask = all_cpus ? 
NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + +ret_success: + return HV_STATUS_SUCCESS; +} + bool kvm_hv_hypercall_enabled(struct kvm *kvm) { return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; @@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); break; + case HVCALL_SEND_IPI: + if (unlikely(rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); + break; + case HVCALL_SEND_IPI_EX: + if (unlikely(fast || rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index d6aa969e20f1..0e66c12ed2c3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page); + static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, int timer_index) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fbb0e6df121b..3cd227ff807f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,6 +70,11 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul +static bool lapic_timer_advance_adjust_done = false; +#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +/* step-by-step approximation to mitigate fluctuation */ +#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); - if (ret) + if (ret) { + *r = 0; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; - if (*r < 0) - *r = 0; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } + } rcu_read_unlock(); return ret; @@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 guest_tsc, tsc_deadline; + u64 guest_tsc, tsc_deadline, ns; if (!lapic_in_kernel(vcpu)) return; @@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __delay(min(tsc_deadline - guest_tsc, nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + + if (!lapic_timer_advance_adjust_done) { + /* too early */ + if (guest_tsc < tsc_deadline) { + ns = (tsc_deadline - guest_tsc) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns -= min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } else { + /* too late */ + ns = (guest_tsc - tsc_deadline) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns += min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } + if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + lapic_timer_advance_adjust_done = true; + } } static void start_sw_tscdeadline(struct 
kvm_lapic *apic) @@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; + struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; + unsigned long new_len; + if (!IS_ALIGNED(addr, 4)) return 1; vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr, sizeof(u8)); + + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; + + return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); } void kvm_apic_accept_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ed0ed39abd36..ff6ef9c3d760 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_init(void); void kvm_lapic_exit(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 51b953ad9d4e..4cf43ce42959 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); if (!obj) - return -ENOMEM; + return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; } return 0; @@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) - return -ENOMEM; + return cache->nobjs >= min ? 
0 : -ENOMEM; cache->objects[cache->nobjs++] = page; } return 0; @@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; if (!rmap_head->val) { - printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); + pr_err("%s: %p 0->BUG\n", __func__, spte); BUG(); } else if (!(rmap_head->val & 1)) { - rmap_printk("pte_list_remove: %p 1->0\n", spte); + rmap_printk("%s: %p 1->0\n", __func__, spte); if ((u64 *)rmap_head->val != spte) { - printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); + pr_err("%s: %p 1->BUG\n", __func__, spte); BUG(); } rmap_head->val = 0; } else { - rmap_printk("pte_list_remove: %p many->many\n", spte); + rmap_printk("%s: %p many->many\n", __func__, spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { @@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) prev_desc = desc; desc = desc->more; } - pr_err("pte_list_remove: %p many->many\n", spte); + pr_err("%s: %p many->many\n", __func__, spte); BUG(); } } +static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +{ + mmu_spte_clear_track_bits(sptep); + __pte_list_remove(sptep, rmap_head); +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmap_head); + __pte_list_remove(spte, rmap_head); } /* @@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); flush = true; } @@ -1721,7 +1727,7 @@ restart: need_flush = 1; if (pte_write(*ptep)) { - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; @@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + __pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu) - || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu.base_role; + role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; if (role.direct) role.cr4_pae = 0; role.access = access; - if (!vcpu->arch.mmu.direct_map - && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu->direct_map + && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * 
level)) - 1; role.quadrant = quadrant; @@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato { iterator->addr = addr; iterator->shadow_addr = root; - iterator->level = vcpu->arch.mmu.shadow_root_level; + iterator->level = vcpu->arch.mmu->shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL && - !vcpu->arch.mmu.direct_map) + vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && + !vcpu->arch.mmu->direct_map) --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { @@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ - BUG_ON(root != vcpu->arch.mmu.root_hpa); + BUG_ON(root != vcpu->arch.mmu->root_hpa); iterator->shadow_addr - = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) @@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { - shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, addr); } @@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, int emulate = 0; gfn_t pseudo_gfn; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return 0; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { @@ -3310,7 +3316,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return false; if (!page_fault_can_be_fast(error_code)) @@ -3480,11 +3486,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + ulong roots_to_free) { int i; LIST_HEAD(invalid_list); - struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); @@ -3544,20 +3550,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; unsigned i; - if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, - vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + vcpu->arch.mmu->root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); @@ -3570,9 +3576,9 @@ static int mmu_alloc_direct_roots(struct 
kvm_vcpu *vcpu) root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); @@ -3586,7 +3592,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; int i; - root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3595,8 +3601,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; MMU_WARN_ON(VALID_PAGE(root)); @@ -3606,11 +3612,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = root; + vcpu->arch.mmu->root_hpa = root; return 0; } @@ -3620,17 +3626,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); + if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { + pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu.pae_root[i] = 0; + vcpu->arch.mmu->pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; @@ -3648,16 +3654,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); /* * If we shadow a 32 bit page table with a long mode page * table we enter this path. */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu.lm_root == NULL) { + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->lm_root == NULL) { /* * The additional page necessary for this is only * allocated on demand. 
@@ -3669,12 +3675,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (lm_root == NULL) return 1; - lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; - vcpu->arch.mmu.lm_root = lm_root; + vcpu->arch.mmu->lm_root = lm_root; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } return 0; @@ -3682,7 +3688,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return mmu_alloc_direct_roots(vcpu); else return mmu_alloc_shadow_roots(vcpu); @@ -3693,17 +3699,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; sp = page_header(root); /* @@ -3734,7 +3739,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; @@ -3808,7 +3813,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) goto exit; walk_shadow_page_lockless_begin(vcpu); @@ -3825,7 +3830,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, iterator.level); } @@ -3904,7 +3909,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; walk_shadow_page_lockless_begin(vcpu); @@ -3931,7 +3936,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); return nonpaging_map(vcpu, gva & PAGE_MASK, @@ -3944,8 +3949,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); + arch.direct_map = vcpu->arch.mmu->direct_map; + arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } @@ -4051,7 +4056,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4127,7 +4132,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, { uint i; struct kvm_mmu_root_info root; - struct 
kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; root.cr3 = mmu->get_cr3(vcpu); root.hpa = mmu->root_hpa; @@ -4150,7 +4155,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, bool skip_tlb_flush) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; /* * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid @@ -4201,7 +4206,8 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, + KVM_MMU_ROOT_CURRENT); } void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) @@ -4219,7 +4225,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) static void inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + vcpu->arch.mmu->inject_page_fault(vcpu, fault); } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, @@ -4423,7 +4429,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + bool uses_nx = context->nx || + context->mmu_role.base.smep_andnot_wp; struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4562,7 +4569,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following - * conditions are ture: + * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch @@ -4723,27 +4730,65 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static union kvm_mmu_page_role -kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_extended_role ext = {0}; + + ext.cr0_pg = !!is_paging(vcpu); + ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + ext.cr4_pse = !!is_pse(vcpu); + ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); + ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); + + ext.valid = 1; + + return ext; +} + +static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, + bool base_only) +{ + union kvm_mmu_role role = {0}; + + role.base.access = ACC_ALL; + role.base.nxe = !!is_nx(vcpu); + role.base.cr4_pae = !!is_pae(vcpu); + role.base.cr0_wp = is_write_protection(vcpu); + role.base.smm = is_smm(vcpu); + role.base.guest_mode = is_guest_mode(vcpu); + + if (base_only) + return role; + + role.ext = kvm_calc_mmu_role_ext(vcpu); + + return role; +} + +static union kvm_mmu_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { - union kvm_mmu_page_role role = {0}; + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); - role.guest_mode = is_guest_mode(vcpu); - role.smm = is_smm(vcpu); - role.ad_disabled = (shadow_accessed_mask == 0); - role.level = kvm_x86_ops->get_tdp_level(vcpu); - role.direct = true; - role.access = ACC_ALL; + role.base.ad_disabled = (shadow_accessed_mask == 0); + role.base.level = kvm_x86_ops->get_tdp_level(vcpu); + 
role.base.direct = true; return role; } static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_tdp_mmu_root_page_role(vcpu, false); - context->base_role.word = mmu_base_role_mask.word & - kvm_calc_tdp_mmu_root_page_role(vcpu).word; + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; + + context->mmu_role.as_u64 = new_role.as_u64; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4783,36 +4828,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -static union kvm_mmu_page_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) -{ - union kvm_mmu_page_role role = {0}; - bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - - role.nxe = is_nx(vcpu); - role.cr4_pae = !!is_pae(vcpu); - role.cr0_wp = is_write_protection(vcpu); - role.smep_andnot_wp = smep && !is_write_protection(vcpu); - role.smap_andnot_wp = smap && !is_write_protection(vcpu); - role.guest_mode = is_guest_mode(vcpu); - role.smm = is_smm(vcpu); - role.direct = !is_paging(vcpu); - role.access = ACC_ALL; +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + + role.base.smep_andnot_wp = role.ext.cr4_smep && + !is_write_protection(vcpu); + role.base.smap_andnot_wp = role.ext.cr4_smap && + !is_write_protection(vcpu); + role.base.direct = !is_paging(vcpu); if (!is_long_mode(vcpu)) - role.level = PT32E_ROOT_LEVEL; + role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) - role.level = PT64_ROOT_5LEVEL; + role.base.level = PT64_ROOT_5LEVEL; else - role.level = PT64_ROOT_4LEVEL; + role.base.level = PT64_ROOT_4LEVEL; return role; } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -4823,22 +4868,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.word = mmu_base_role_mask.word & - kvm_calc_shadow_mmu_root_page_role(vcpu).word; + context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -static union kvm_mmu_page_role -kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +static union kvm_mmu_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, + bool execonly) { - union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + union kvm_mmu_role role; + + /* Base role is inherited from root_mmu */ + role.base.word = vcpu->arch.root_mmu.mmu_role.base.word; + role.ext = kvm_calc_mmu_role_ext(vcpu); + + role.base.level = PT64_ROOT_4LEVEL; + role.base.direct = false; + role.base.ad_disabled = !accessed_dirty; + role.base.guest_mode = true; + role.base.access = ACC_ALL; - role.level = PT64_ROOT_4LEVEL; - role.direct = false; - role.ad_disabled = !accessed_dirty; - role.guest_mode = true; - role.access = ACC_ALL; + 
role.ext.execonly = execonly; return role; } @@ -4846,11 +4897,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = &vcpu->arch.mmu; - union kvm_mmu_page_role root_page_role = - kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, + execonly); + + __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); + + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; - __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4862,7 +4919,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; context->direct_map = false; - context->base_role.word = root_page_role.word & mmu_base_role_mask.word; + context->mmu_role.as_u64 = new_role.as_u64; + update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4873,7 +4931,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; kvm_init_shadow_mmu(vcpu); context->set_cr3 = kvm_x86_ops->set_cr3; @@ -4884,14 +4942,20 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { + union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == g_context->mmu_role.as_u64) + return; + + g_context->mmu_role.as_u64 = new_role.as_u64; g_context->get_cr3 = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* - * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using + * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). 
The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's @@ -4930,10 +4994,10 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) if (reset_roots) { uint i; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu->root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; } if (mmu_is_nested(vcpu)) @@ -4948,10 +5012,14 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_role role; + if (tdp_enabled) - return kvm_calc_tdp_mmu_root_page_role(vcpu); + role = kvm_calc_tdp_mmu_root_page_role(vcpu, true); else - return kvm_calc_shadow_mmu_root_page_role(vcpu); + role = kvm_calc_shadow_mmu_root_page_role(vcpu, true); + + return role.base; } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -4981,8 +5049,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); + WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4996,7 +5066,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu->update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -5173,10 +5243,12 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, local_flush = true; while (npte--) { + u32 base_role = vcpu->arch.mmu->mmu_role.base.word; + entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && - !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + !((sp->role.word ^ base_role) & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) @@ -5194,7 +5266,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -5230,10 +5302,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, { int r, emulation_type = 0; enum emulation_result er; - bool direct = vcpu->arch.mmu.direct_map; + bool direct = vcpu->arch.mmu->direct_map; /* With shadow page tables, fault_address contains a GVA or nGPA. */ - if (vcpu->arch.mmu.direct_map) { + if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; vcpu->arch.gpa_val = cr2; } @@ -5246,8 +5318,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), - false); + r = vcpu->arch.mmu->page_fault(vcpu, cr2, + lower_32_bits(error_code), + false); WARN_ON(r == RET_PF_INVALID); } @@ -5263,7 +5336,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * paging in both guests. If true, we simply unprotect the page * and resume the guest. 
*/ - if (vcpu->arch.mmu.direct_map && + if (vcpu->arch.mmu->direct_map && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); return 1; @@ -5311,7 +5384,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; int i; /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ @@ -5342,7 +5415,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; bool tlb_flush = false; uint i; @@ -5386,8 +5459,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - free_page((unsigned long)vcpu->arch.mmu.pae_root); - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((unsigned long)vcpu->arch.mmu->pae_root); + free_page((unsigned long)vcpu->arch.mmu->lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -5407,9 +5480,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if (!page) return -ENOMEM; - vcpu->arch.mmu.pae_root = page_address(page); + vcpu->arch.mmu->pae_root = page_address(page); for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + vcpu->arch.mmu->pae_root[i] = INVALID_PAGE; return 0; } @@ -5418,27 +5491,21 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { uint i; - vcpu->arch.walk_mmu = &vcpu->arch.mmu; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; + vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - return alloc_mmu_pages(vcpu); -} + vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; -void kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.guest_mmu.translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - /* - * kvm_mmu_setup() is called only on vCPU initialization. - * Therefore, no need to reset mmu roots as they are not yet - * initialized. - */ - kvm_init_mmu(vcpu, false); + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + return alloc_mmu_pages(vcpu); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, @@ -5621,7 +5688,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && PageTransCompoundMap(pfn_to_page(pfn))) { - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); need_tlb_flush = 1; goto restart; } @@ -5878,6 +5945,16 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; + /* + * MMU roles use union aliasing which is, generally speaking, an + * undefined behavior. However, we supposedly know how compilers behave + * and the current status quo is unlikely to change. Guardians below are + * supposed to let us know if the assumption becomes false. 
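 *
 * Editor's illustration (not part of the patch): the aliasing in
 * question is roughly
 *
 *	union kvm_mmu_role {
 *		u64 as_u64;
 *		struct {
 *			union kvm_mmu_page_role     base;	(32 bits)
 *			union kvm_mmu_extended_role ext;	(32 bits)
 *		};
 *	};
 *
 * which is what lets the cached-role fast paths above use a single
 * 64-bit compare, "new_role.as_u64 == context->mmu_role.as_u64".  The
 * three BUILD_BUG_ON()s below pin the sizes that make that comparison
 * meaningful.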
+ */ + BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); + BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); + BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64)); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", @@ -5907,7 +5984,7 @@ out: } /* - * Caculate mmu pages needed for kvm. + * Calculate mmu pages needed for kvm. */ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1fab69c0b2f3..c7b333147c4a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -43,11 +43,6 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 -#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) - static inline u64 rsvd_bits(int s, int e) { if (e < s) @@ -80,7 +75,7 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) + if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) return 0; return kvm_mmu_load(vcpu); @@ -102,9 +97,9 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | - kvm_get_active_pcid(vcpu)); + if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) + vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa | + kvm_get_active_pcid(vcpu)); } /* diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 1272861e77b9..abac7e208853 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -59,19 +59,19 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) int i; struct kvm_mmu_page *sp; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); return; } for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; @@ -122,7 +122,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu.root_level, pfn, + "ent %llxn", vcpu->arch.mmu->root_level, pfn, hpa, *sptep); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 14ffd973df54..7cf2185b7eb5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -158,14 +158,15 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && + 
!(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -480,7 +481,7 @@ error: static int FNAME(walk_addr)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) { - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); } @@ -509,7 +510,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -604,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct_access = gw->pte_access; - top_level = vcpu->arch.mmu.root_level; + top_level = vcpu->arch.mmu->root_level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* @@ -616,7 +617,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, addr); @@ -1004,7 +1005,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d96092b35936..f416f5c7f2ae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -805,6 +805,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) nested_svm_check_exception(svm, nr, has_error_code, error_code)) return; + kvm_deliver_exception_payload(&svm->vcpu); + if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); @@ -2918,18 +2920,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); kvm_init_shadow_mmu(vcpu); - vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; - vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; - vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu); - reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); + vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; + vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); + reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) { - vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static int nested_svm_check_permissions(struct vcpu_svm *svm) @@ -2965,16 +2967,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; /* - * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception. 
- * The fix is to add the ancillary datum (CR2 or DR6) to structs - * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be - * written only when inject_pending_event runs (DR6 would written here - * too). This should be conditional on a new capability---if the - * capability is disabled, kvm_multiple_exception would write the - * ancillary information to CR2 or DR6, for backwards ABI-compatibility. + * EXITINFO2 is undefined for all exception intercepts other + * than #PF. */ if (svm->vcpu.arch.exception.nested_apf) svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; else svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; @@ -5638,26 +5637,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" -#endif /* * Clear host registers marked as clobbered to prevent * speculative use. */ - "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" - "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" - "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" - "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" -#ifdef CONFIG_X86_64 - "xor %%r8, %%r8 \n\t" - "xor %%r9, %%r9 \n\t" - "xor %%r10, %%r10 \n\t" - "xor %%r11, %%r11 \n\t" - "xor %%r12, %%r12 \n\t" - "xor %%r13, %%r13 \n\t" - "xor %%r14, %%r14 \n\t" - "xor %%r15, %%r15 \n\t" + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" #endif + "xor %%ebx, %%ebx \n\t" + "xor %%ecx, %%ecx \n\t" + "xor %%edx, %%edx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP : : [svm]"a"(svm), @@ -7036,6 +7033,13 @@ failed: return ret; } +static int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + /* Intel-only feature */ + return -ENODEV; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7165,6 +7169,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_op = svm_mem_enc_op, .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, + + .nested_enable_evmcs = nested_enable_evmcs, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0f997683404f..0659465a745c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, __entry->valid_bank_mask, __entry->format, __entry->address_space, __entry->flags) ); + +/* + * Tracepoints for kvm_hv_send_ipi. 
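 *
 * Editor's note, illustrative only: with the TP_printk format below, a
 * trace line comes out roughly as
 *	kvm_hv_send_ipi: vector 2f processor_mask 0x3
 * i.e. the interrupt vector in hex and the target processor mask for
 * the plain (non-"ex") form of the hypercall.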
+ */ +TRACE_EVENT(kvm_hv_send_ipi, + TP_PROTO(u32 vector, u64 processor_mask), + TP_ARGS(vector, processor_mask), + + TP_STRUCT__entry( + __field(u32, vector) + __field(u64, processor_mask) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->processor_mask = processor_mask; + ), + + TP_printk("vector %x processor_mask 0x%llx", + __entry->vector, __entry->processor_mask) +); + +TRACE_EVENT(kvm_hv_send_ipi_ex, + TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask), + TP_ARGS(vector, format, valid_bank_mask), + + TP_STRUCT__entry( + __field(u32, vector) + __field(u64, format) + __field(u64, valid_bank_mask) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->format = format; + __entry->valid_bank_mask = valid_bank_mask; + ), + + TP_printk("vector %x format %llx valid_bank_mask 0x%llx", + __entry->vector, __entry->format, + __entry->valid_bank_mask) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 612fd17be635..641a65b30685 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "cpuid.h" #include "lapic.h" +#include "hyperv.h" #include <linux/kvm_host.h> #include <linux/module.h> @@ -61,7 +62,7 @@ #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) + ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -107,9 +108,12 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static bool __read_mostly nested = 0; +static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; @@ -131,7 +135,7 @@ static bool __read_mostly enable_preemption_timer = 1; module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif -#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ @@ -187,6 +191,7 @@ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +extern const ulong vmx_early_consistency_check_return; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); @@ -827,14 +832,28 @@ struct nested_vmx { */ struct vmcs12 *cached_shadow_vmcs12; /* - * Indicates if the shadow vmcs must be updated with the - * data hold by vmcs12 + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. */ - bool sync_shadow_vmcs; + bool need_vmcs12_sync; bool dirty_vmcs12; + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + bool change_vmcs01_virtual_apic_mode; + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. 
+ */ + bool enlightened_vmcs_enabled; + /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; @@ -870,6 +889,10 @@ struct nested_vmx { /* in guest mode on SMM entry? */ bool guest_mode; } smm; + + gpa_t hv_evmcs_vmptr; + struct page *hv_evmcs_page; + struct hv_enlightened_vmcs *hv_evmcs; }; #define POSTED_INTR_ON 0 @@ -1381,6 +1404,49 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #define KVM_EVMCS_VERSION 1 +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -1473,69 +1539,12 @@ static void evmcs_load(u64 phys_addr) static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { - /* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - */ - vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_APIC_REGISTER_VIRT; - - /* - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML; - - /* VM_FUNCTION_CONTROL = 0x00002018, */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC; - - /* - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS; - - /* - * TSC_MULTIPLIER = 0x00002032, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING; + vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - /* - * PLE_GAP = 
0x00004020, - * PLE_WINDOW = 0x00004022, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - - /* - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - */ - vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - - /* - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - */ - vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - /* - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ } /* check_ept_pointer() should be under protection of ept_pointer_lock. */ @@ -1560,7 +1569,8 @@ static void check_ept_pointer_match(struct kvm *kvm) static int vmx_hv_remote_flush_tlb(struct kvm *kvm) { - int ret; + struct kvm_vcpu *vcpu; + int ret = -ENOTSUPP, i; spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); @@ -1568,14 +1578,14 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) check_ept_pointer_match(kvm); if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { - ret = -ENOTSUPP; - goto out; + kvm_for_each_vcpu(i, vcpu, kvm) + ret |= hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer); + } else { + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); } - ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); - -out: spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); return ret; } @@ -1591,6 +1601,35 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +static int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. 
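 *
 * Editor's note (illustrative, not part of the patch): a caller that
 * receives this packed value can recover the range with
 *	min_ver = vmcs_version & 0xff;		(== 1)
 *	max_ver = vmcs_version >> 8;		(== KVM_EVMCS_VERSION)
 * matching the "(KVM_EVMCS_VERSION << 8) | 1" written below.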
+ */ + if (vmcs_version) + *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; + + return 0; +} + static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1613,11 +1652,6 @@ static inline bool is_page_fault(u32 intr_info) return is_exception_n(intr_info, PF_VECTOR); } -static inline bool is_no_device(u32 intr_info) -{ - return is_exception_n(intr_info, NM_VECTOR); -} - static inline bool is_invalid_opcode(u32 intr_info) { return is_exception_n(intr_info, UD_VECTOR); @@ -1628,12 +1662,6 @@ static inline bool is_gp_fault(u32 intr_info) return is_exception_n(intr_info, GP_VECTOR); } -static inline bool is_external_interrupt(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); -} - static inline bool is_machine_check(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -2059,9 +2087,6 @@ static inline bool is_nmi(u32 intr_info) static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); -static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification); static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { @@ -2073,7 +2098,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } -static inline void __invvpid(int ext, u16 vpid, gva_t gva) +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) { struct { u64 vpid : 16; @@ -2082,22 +2107,20 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) } operand = { vpid, 0, gva }; bool error; - asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) - : CC_OUT(na) (error) : "a"(&operand), "c"(ext) - : "memory"); + asm volatile (__ex("invvpid %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); BUG_ON(error); } -static inline void __invept(int ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) { struct { u64 eptp, gpa; } operand = {eptp, gpa}; bool error; - asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) - : CC_OUT(na) (error) : "a" (&operand), "c" (ext) - : "memory"); + asm volatile (__ex("invept %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); BUG_ON(error); } @@ -2116,9 +2139,8 @@ static void vmcs_clear(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) - : "memory"); + asm volatile (__ex("vmclear %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); @@ -2141,9 +2163,8 @@ static void vmcs_load(struct vmcs *vmcs) if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) - : "memory"); + asm volatile (__ex("vmptrld %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", 
vmcs, phys_addr); @@ -2319,8 +2340,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") - : "=a"(value) : "d"(field) : "cc"); + asm volatile (__ex_clear("vmread %1, %0", "%k0") + : "=r"(value) : "r"(field)); return value; } @@ -2371,8 +2392,8 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val { bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) - : CC_OUT(na) (error) : "a"(value), "d"(field)); + asm volatile (__ex("vmwrite %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(field), "rm"(value)); if (unlikely(error)) vmwrite_error(field, value); } @@ -2703,7 +2724,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); - vmcs_write64(host_val_vmcs, host_val); + if (host_val_vmcs != HOST_IA32_EFER) + vmcs_write64(host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } @@ -2801,8 +2823,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) ignore_bits &= ~(u64)EFER_SCE; #endif - clear_atomic_switch_msr(vmx, MSR_EFER); - /* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER @@ -2815,8 +2835,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer, false); + else + clear_atomic_switch_msr(vmx, MSR_EFER); return false; } else { + clear_atomic_switch_msr(vmx, MSR_EFER); + guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; @@ -3268,34 +3292,30 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned int nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; if (nr == PF_VECTOR) { if (vcpu->arch.exception.nested_apf) { *exit_qual = vcpu->arch.apf.nested_apf_token; return 1; } - /* - * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception. - * The fix is to add the ancillary datum (CR2 or DR6) to structs - * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 - * can be written only when inject_pending_event runs. This should be - * conditional on a new capability---if the capability is disabled, - * kvm_multiple_exception would write the ancillary information to - * CR2 or DR6, for backwards ABI-compatibility. - */ if (nested_vmx_is_page_fault_vmexit(vmcs12, vcpu->arch.exception.error_code)) { - *exit_qual = vcpu->arch.cr2; - return 1; - } - } else { - if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) - *exit_qual = vcpu->arch.dr6; - else - *exit_qual = 0; + *exit_qual = has_payload ? 
payload : vcpu->arch.cr2; return 1; } + } else if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) { + if (!has_payload) { + payload = vcpu->arch.dr6; + payload &= ~(DR6_FIXED_1 | DR6_BT); + payload ^= DR6_RTM; + } + *exit_qual = payload; + } else + *exit_qual = 0; + return 1; } return 0; @@ -3322,6 +3342,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; + kvm_deliver_exception_payload(vcpu); + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; @@ -4393,9 +4415,7 @@ static void kvm_cpu_vmxon(u64 addr) cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); + asm volatile ("vmxon %0" : : "m"(addr)); } static int hardware_enable(void) @@ -4464,7 +4484,7 @@ static void vmclear_local_loaded_vmcss(void) */ static void kvm_cpu_vmxoff(void) { - asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + asm volatile (__ex("vmxoff")); intel_pt_handle_vmx(0); cr4_clear_bits(X86_CR4_VMXE); @@ -5108,9 +5128,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, bool invalidate_gpa) { if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; - ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, + vcpu->arch.mmu->root_hpa)); } else { vpid_sync_context(vpid); } @@ -5260,7 +5281,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { @@ -6335,6 +6356,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_CR_PAT, low32, high32); vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } + + if (cpu_has_load_ia32_efer) + vmcs_write64(HOST_IA32_EFER, host_efer); } static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -6662,7 +6686,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { - ASSERT(vmx->pml_pg); vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } @@ -8063,35 +8086,39 @@ static int handle_monitor(struct kvm_vcpu *vcpu) /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. 
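 *
 * Editor's note (illustrative, not part of the patch): because these
 * helpers now return kvm_skip_emulated_instruction(), the VMX handlers
 * converted below collapse the old two-step "set flags, then skip"
 * pattern into a single return, e.g.
 *
 *	if (vmx->nested.vmxon)
 *		return nested_vmx_failValid(vcpu,
 *			VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 *	...
 *	return nested_vmx_succeed(vcpu);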
*/ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); } -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); } -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) { - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) @@ -8101,6 +8128,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, * We don't need to force a shadow sync because * VM_INSTRUCTION_ERROR is not shadowed */ + return kvm_skip_emulated_instruction(vcpu); } static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) @@ -8288,6 +8316,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vpid02 = allocate_vpid(); + vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; return 0; @@ -8341,10 +8370,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (vmx->nested.vmxon) { - nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmx->nested.vmxon) + return nested_vmx_failValid(vcpu, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION); if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { @@ -8363,21 +8391,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failInvalid(vcpu); page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); kvm_release_page_clean(page); - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_failInvalid(vcpu); } kunmap(page); kvm_release_page_clean(page); @@ -8387,8 +8411,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } /* @@ -8419,8 +8442,24 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } -static inline void 
nested_release_vmcs12(struct vcpu_vmx *vmx) +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.hv_evmcs) + return; + + kunmap(vmx->nested.hv_evmcs_page); + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_page = NULL; + vmx->nested.hv_evmcs = NULL; +} + +static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->nested.current_vmptr == -1ull) return; @@ -8428,16 +8467,18 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; + vmx->nested.need_vmcs12_sync = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ - kvm_vcpu_write_guest_page(&vmx->vcpu, + kvm_vcpu_write_guest_page(vcpu, vmx->nested.current_vmptr >> PAGE_SHIFT, vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + vmx->nested.current_vmptr = -1ull; } @@ -8445,8 +8486,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. */ -static void free_nested(struct vcpu_vmx *vmx) +static void free_nested(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; @@ -8479,6 +8522,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + free_loaded_vmcs(&vmx->nested.vmcs02); } @@ -8487,9 +8534,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; - free_nested(to_vmx(vcpu)); - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + free_nested(vcpu); + return nested_vmx_succeed(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -8505,25 +8551,28 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); - if (vmptr == vmx->nested.current_vmptr) - nested_release_vmcs12(vmx); + if (vmx->nested.hv_evmcs_page) { + if (vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + } else { + if (vmptr == vmx->nested.current_vmptr) + nested_release_vmcs12(vcpu); - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, launch_state), - &zero, sizeof(zero)); + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); + } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -8606,6 +8655,395 @@ static inline int vmcs12_write_any(struct vmcs12 *vmcs12, } 
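A brief aside on the helper added below: copy_enlightened_to_vmcs12() only refreshes a group of vmcs12 fields when the corresponding Hyper-V "clean fields" bit is not set, i.e. when L1 may have changed that group since the last sync. A minimal sketch of that check, assuming only the hv_clean_fields member and the HV_VMX_ENLIGHTENED_CLEAN_FIELD_* definitions already used by the patch (the helper name is invented for illustration):

/* Sketch only, editor's illustration, not part of the patch. */
static bool evmcs_group_dirty(const struct hv_enlightened_vmcs *evmcs,
			      u32 clean_field_bit)
{
	/*
	 * A set clean-field bit means "unchanged since the last sync",
	 * so a clear bit is what forces the copy.
	 */
	return !(evmcs->hv_clean_fields & clean_field_bit);
}

Each "unlikely(!(evmcs->hv_clean_fields & ...))" test in the function below is this check, open-coded once per field group.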
+static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + vmcs12->hdr.revision_id = evmcs->revision_id; + + /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ + vmcs12->tpr_threshold = evmcs->tpr_threshold; + vmcs12->guest_rip = evmcs->guest_rip; + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { + vmcs12->guest_rsp = evmcs->guest_rsp; + vmcs12->guest_rflags = evmcs->guest_rflags; + vmcs12->guest_interruptibility_info = + evmcs->guest_interruptibility_info; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->cpu_based_vm_exec_control = + evmcs->cpu_based_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->exception_bitmap = evmcs->exception_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { + vmcs12->vm_entry_controls = evmcs->vm_entry_controls; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { + vmcs12->vm_entry_intr_info_field = + evmcs->vm_entry_intr_info_field; + vmcs12->vm_entry_exception_error_code = + evmcs->vm_entry_exception_error_code; + vmcs12->vm_entry_instruction_len = + evmcs->vm_entry_instruction_len; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->host_ia32_pat = evmcs->host_ia32_pat; + vmcs12->host_ia32_efer = evmcs->host_ia32_efer; + vmcs12->host_cr0 = evmcs->host_cr0; + vmcs12->host_cr3 = evmcs->host_cr3; + vmcs12->host_cr4 = evmcs->host_cr4; + vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; + vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; + vmcs12->host_rip = evmcs->host_rip; + vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; + vmcs12->host_es_selector = evmcs->host_es_selector; + vmcs12->host_cs_selector = evmcs->host_cs_selector; + vmcs12->host_ss_selector = evmcs->host_ss_selector; + vmcs12->host_ds_selector = evmcs->host_ds_selector; + vmcs12->host_fs_selector = evmcs->host_fs_selector; + vmcs12->host_gs_selector = evmcs->host_gs_selector; + vmcs12->host_tr_selector = evmcs->host_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->pin_based_vm_exec_control = + evmcs->pin_based_vm_exec_control; + vmcs12->vm_exit_controls = evmcs->vm_exit_controls; + vmcs12->secondary_vm_exec_control = + evmcs->secondary_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { + vmcs12->io_bitmap_a = evmcs->io_bitmap_a; + vmcs12->io_bitmap_b = evmcs->io_bitmap_b; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { + vmcs12->msr_bitmap = evmcs->msr_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { + vmcs12->guest_es_base = evmcs->guest_es_base; + vmcs12->guest_cs_base = evmcs->guest_cs_base; + vmcs12->guest_ss_base = evmcs->guest_ss_base; + vmcs12->guest_ds_base = evmcs->guest_ds_base; + vmcs12->guest_fs_base = evmcs->guest_fs_base; + vmcs12->guest_gs_base = evmcs->guest_gs_base; + vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; + vmcs12->guest_tr_base = evmcs->guest_tr_base; + vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; + vmcs12->guest_idtr_base = 
evmcs->guest_idtr_base; + vmcs12->guest_es_limit = evmcs->guest_es_limit; + vmcs12->guest_cs_limit = evmcs->guest_cs_limit; + vmcs12->guest_ss_limit = evmcs->guest_ss_limit; + vmcs12->guest_ds_limit = evmcs->guest_ds_limit; + vmcs12->guest_fs_limit = evmcs->guest_fs_limit; + vmcs12->guest_gs_limit = evmcs->guest_gs_limit; + vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; + vmcs12->guest_tr_limit = evmcs->guest_tr_limit; + vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; + vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; + vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; + vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; + vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; + vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; + vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; + vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; + vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; + vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; + vmcs12->guest_es_selector = evmcs->guest_es_selector; + vmcs12->guest_cs_selector = evmcs->guest_cs_selector; + vmcs12->guest_ss_selector = evmcs->guest_ss_selector; + vmcs12->guest_ds_selector = evmcs->guest_ds_selector; + vmcs12->guest_fs_selector = evmcs->guest_fs_selector; + vmcs12->guest_gs_selector = evmcs->guest_gs_selector; + vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; + vmcs12->guest_tr_selector = evmcs->guest_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { + vmcs12->tsc_offset = evmcs->tsc_offset; + vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; + vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { + vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; + vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; + vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; + vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; + vmcs12->guest_cr0 = evmcs->guest_cr0; + vmcs12->guest_cr3 = evmcs->guest_cr3; + vmcs12->guest_cr4 = evmcs->guest_cr4; + vmcs12->guest_dr7 = evmcs->guest_dr7; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { + vmcs12->host_fs_base = evmcs->host_fs_base; + vmcs12->host_gs_base = evmcs->host_gs_base; + vmcs12->host_tr_base = evmcs->host_tr_base; + vmcs12->host_gdtr_base = evmcs->host_gdtr_base; + vmcs12->host_idtr_base = evmcs->host_idtr_base; + vmcs12->host_rsp = evmcs->host_rsp; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { + vmcs12->ept_pointer = evmcs->ept_pointer; + vmcs12->virtual_processor_id = evmcs->virtual_processor_id; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { + vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; + vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; + vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; + vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; + vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; + vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; + vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; + vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; + vmcs12->guest_pending_dbg_exceptions = + evmcs->guest_pending_dbg_exceptions; + vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; + vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; + vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; + vmcs12->guest_activity_state = 
evmcs->guest_activity_state; + vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + } + + /* + * Not used? + * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; + * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; + * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; + * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; + * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; + * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; + * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; + * vmcs12->page_fault_error_code_mask = + * evmcs->page_fault_error_code_mask; + * vmcs12->page_fault_error_code_match = + * evmcs->page_fault_error_code_match; + * vmcs12->cr3_target_count = evmcs->cr3_target_count; + * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; + * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; + * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; + */ + + /* + * Read only fields: + * vmcs12->guest_physical_address = evmcs->guest_physical_address; + * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; + * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; + * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; + * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; + * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; + * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; + * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; + * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; + * vmcs12->exit_qualification = evmcs->exit_qualification; + * vmcs12->guest_linear_address = evmcs->guest_linear_address; + * + * Not present in struct vmcs12: + * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; + * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; + * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; + * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; + */ + + return 0; +} + +static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* + * Should not be changed by KVM: + * + * evmcs->host_es_selector = vmcs12->host_es_selector; + * evmcs->host_cs_selector = vmcs12->host_cs_selector; + * evmcs->host_ss_selector = vmcs12->host_ss_selector; + * evmcs->host_ds_selector = vmcs12->host_ds_selector; + * evmcs->host_fs_selector = vmcs12->host_fs_selector; + * evmcs->host_gs_selector = vmcs12->host_gs_selector; + * evmcs->host_tr_selector = vmcs12->host_tr_selector; + * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; + * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; + * evmcs->host_cr0 = vmcs12->host_cr0; + * evmcs->host_cr3 = vmcs12->host_cr3; + * evmcs->host_cr4 = vmcs12->host_cr4; + * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; + * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; + * evmcs->host_rip = vmcs12->host_rip; + * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; + * evmcs->host_fs_base = vmcs12->host_fs_base; + * evmcs->host_gs_base = vmcs12->host_gs_base; + * evmcs->host_tr_base = vmcs12->host_tr_base; + * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; + * evmcs->host_idtr_base = vmcs12->host_idtr_base; + * evmcs->host_rsp = vmcs12->host_rsp; + * sync_vmcs12() doesn't read these: + * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; + * evmcs->io_bitmap_b = 
vmcs12->io_bitmap_b; + * evmcs->msr_bitmap = vmcs12->msr_bitmap; + * evmcs->ept_pointer = vmcs12->ept_pointer; + * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; + * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; + * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; + * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; + * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; + * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; + * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; + * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; + * evmcs->tpr_threshold = vmcs12->tpr_threshold; + * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; + * evmcs->exception_bitmap = vmcs12->exception_bitmap; + * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; + * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; + * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; + * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; + * evmcs->page_fault_error_code_mask = + * vmcs12->page_fault_error_code_mask; + * evmcs->page_fault_error_code_match = + * vmcs12->page_fault_error_code_match; + * evmcs->cr3_target_count = vmcs12->cr3_target_count; + * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; + * evmcs->tsc_offset = vmcs12->tsc_offset; + * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; + * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; + * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; + * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; + * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; + * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; + * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; + * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * + * Not present in struct vmcs12: + * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; + * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; + * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; + * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + */ + + evmcs->guest_es_selector = vmcs12->guest_es_selector; + evmcs->guest_cs_selector = vmcs12->guest_cs_selector; + evmcs->guest_ss_selector = vmcs12->guest_ss_selector; + evmcs->guest_ds_selector = vmcs12->guest_ds_selector; + evmcs->guest_fs_selector = vmcs12->guest_fs_selector; + evmcs->guest_gs_selector = vmcs12->guest_gs_selector; + evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; + evmcs->guest_tr_selector = vmcs12->guest_tr_selector; + + evmcs->guest_es_limit = vmcs12->guest_es_limit; + evmcs->guest_cs_limit = vmcs12->guest_cs_limit; + evmcs->guest_ss_limit = vmcs12->guest_ss_limit; + evmcs->guest_ds_limit = vmcs12->guest_ds_limit; + evmcs->guest_fs_limit = vmcs12->guest_fs_limit; + evmcs->guest_gs_limit = vmcs12->guest_gs_limit; + evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; + evmcs->guest_tr_limit = vmcs12->guest_tr_limit; + evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; + evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; + + evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; + evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; + evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; + evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; + evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; + evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; + evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; + 
evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; + + evmcs->guest_es_base = vmcs12->guest_es_base; + evmcs->guest_cs_base = vmcs12->guest_cs_base; + evmcs->guest_ss_base = vmcs12->guest_ss_base; + evmcs->guest_ds_base = vmcs12->guest_ds_base; + evmcs->guest_fs_base = vmcs12->guest_fs_base; + evmcs->guest_gs_base = vmcs12->guest_gs_base; + evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; + evmcs->guest_tr_base = vmcs12->guest_tr_base; + evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; + evmcs->guest_idtr_base = vmcs12->guest_idtr_base; + + evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; + evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; + + evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; + evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; + evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; + evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; + + evmcs->guest_pending_dbg_exceptions = + vmcs12->guest_pending_dbg_exceptions; + evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; + evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; + + evmcs->guest_activity_state = vmcs12->guest_activity_state; + evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; + + evmcs->guest_cr0 = vmcs12->guest_cr0; + evmcs->guest_cr3 = vmcs12->guest_cr3; + evmcs->guest_cr4 = vmcs12->guest_cr4; + evmcs->guest_dr7 = vmcs12->guest_dr7; + + evmcs->guest_physical_address = vmcs12->guest_physical_address; + + evmcs->vm_instruction_error = vmcs12->vm_instruction_error; + evmcs->vm_exit_reason = vmcs12->vm_exit_reason; + evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; + evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; + evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; + evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; + evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; + evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; + + evmcs->exit_qualification = vmcs12->exit_qualification; + + evmcs->guest_linear_address = vmcs12->guest_linear_address; + evmcs->guest_rsp = vmcs12->guest_rsp; + evmcs->guest_rflags = vmcs12->guest_rflags; + + evmcs->guest_interruptibility_info = + vmcs12->guest_interruptibility_info; + evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; + evmcs->vm_entry_controls = vmcs12->vm_entry_controls; + evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; + evmcs->vm_entry_exception_error_code = + vmcs12->vm_entry_exception_error_code; + evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; + + evmcs->guest_rip = vmcs12->guest_rip; + + evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; + + return 0; +} + /* * Copy the writable VMCS shadow fields back to the VMCS12, in case * they have been modified by the L1 guest. Note that the "read-only" @@ -8679,20 +9117,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } -/* - * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was - * used before) all generate the same failure when it is missing. 
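The hunks that follow repeatedly collapse the old two-step pattern of calling nested_vmx_failValid()/nested_vmx_failInvalid()/nested_vmx_succeed() and then returning kvm_skip_emulated_instruction(vcpu) into a single return statement. That only works if the status helpers were converted, elsewhere in this series, to perform the skip themselves; a minimal sketch of what nested_vmx_succeed() presumably looks like after that conversion (not part of this hunk):

	static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
	{
		/* VMsucceed: clear CF, PF, AF, ZF, SF and OF */
		vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) &
			       ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
				 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
		/* ...and advance RIP past the emulated VMX instruction */
		return kvm_skip_emulated_instruction(vcpu);
	}

The failValid/failInvalid variants would do the same after setting the VMfail flags, which is why the converted handlers below can simply return their result.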
- */ -static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) { - nested_vmx_failInvalid(vcpu); - return 0; - } - return 1; -} - static int handle_vmread(struct kvm_vcpu *vcpu) { unsigned long field; @@ -8705,8 +9129,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_check_vmcs12(vcpu)) - return kvm_skip_emulated_instruction(vcpu); + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); if (!is_guest_mode(vcpu)) vmcs12 = get_vmcs12(vcpu); @@ -8715,20 +9139,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * When vmcs->vmcs_link_pointer is -1ull, any VMREAD * to shadowed-field sets the ALU flags for VMfailInvalid. */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); vmcs12 = get_shadow_vmcs12(vcpu); } /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending @@ -8746,8 +9168,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) (is_long_mode(vcpu) ? 8 : 4), NULL); } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } @@ -8772,8 +9193,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_check_vmcs12(vcpu)) - return kvm_skip_emulated_instruction(vcpu); + if (vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, @@ -8796,11 +9217,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * VMCS," then the "read-only" fields are actually read/write. */ if (vmcs_field_readonly(field) && - !nested_cpu_has_vmwrite_any_field(vcpu)) { - nested_vmx_failValid(vcpu, + !nested_cpu_has_vmwrite_any_field(vcpu)) + return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } if (!is_guest_mode(vcpu)) vmcs12 = get_vmcs12(vcpu); @@ -8809,18 +9228,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE * to shadowed-field sets the ALU flags for VMfailInvalid. 
*/ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); vmcs12 = get_shadow_vmcs12(vcpu); - } - if (vmcs12_write_any(vmcs12, field, field_value) < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmcs12_write_any(vmcs12, field, field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * Do not track vmcs12 dirty-state if in guest-mode @@ -8842,8 +9257,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) } } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) @@ -8854,7 +9268,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.sync_shadow_vmcs = true; + vmx->nested.need_vmcs12_sync = true; } vmx->nested.dirty_vmcs12 = true; } @@ -8871,36 +9285,37 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_VMXON_POINTER); + + /* Forbid normal VMPTRLD if Enlightened version was used */ + if (vmx->nested.hv_evmcs) + return 1; if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); - nested_vmx_failValid(vcpu, + return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - return kvm_skip_emulated_instruction(vcpu); } - nested_release_vmcs12(vmx); + nested_release_vmcs12(vcpu); + /* * Load VMCS12 from guest memory since it is not already * cached. @@ -8912,8 +9327,71 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) set_current_vmptr(vmx, vmptr); } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); +} + +/* + * This is an equivalent of the nested hypervisor executing the vmptrld + * instruction. 
+ */ +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, + bool from_launch) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_vp_assist_page assist_page; + + if (likely(!vmx->nested.enlightened_vmcs_enabled)) + return 1; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return 1; + + if (unlikely(!assist_page.enlighten_vmentry)) + return 1; + + if (unlikely(assist_page.current_nested_vmcs != + vmx->nested.hv_evmcs_vmptr)) { + + if (!vmx->nested.hv_evmcs) + vmx->nested.current_vmptr = -1ull; + + nested_release_evmcs(vcpu); + + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( + vcpu, assist_page.current_nested_vmcs); + + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + return 0; + + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + + if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) { + nested_release_evmcs(vcpu); + return 0; + } + + vmx->nested.dirty_vmcs12 = true; + /* + * As we keep L2 state for one guest only 'hv_clean_fields' mask + * can't be used when we switch between them. Reset it here for + * simplicity. + */ + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + + /* + * Unlike normal vmcs12, enlightened vmcs12 is not fully + * reloaded from guest's memory (read only fields, fields not + * present in struct hv_enlightened_vmcs, ...). Make sure there + * are no leftovers. + */ + if (from_launch) + memset(vmx->nested.cached_vmcs12, 0, + sizeof(*vmx->nested.cached_vmcs12)); + + } + return 1; } /* Emulate the VMPTRST instruction */ @@ -8928,6 +9406,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; + if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + return 1; + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ @@ -8936,8 +9417,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) kvm_inject_page_fault(vcpu, &e); return 1; } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } /* Emulate the INVEPT instruction */ @@ -8967,11 +9447,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - if (type >= 32 || !(types & (1 << type))) { - nested_vmx_failValid(vcpu, + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -8993,14 +9471,20 @@ static int handle_invept(struct kvm_vcpu *vcpu) case VMX_EPT_EXTENT_CONTEXT: kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - nested_vmx_succeed(vcpu); break; default: BUG_ON(1); break; } - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); +} + +static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? 
vmx->nested.vpid02 : vmx->vpid; } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -9014,6 +9498,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 vpid; u64 gla; } operand; + u16 vpid02; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -9031,11 +9516,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; - if (type >= 32 || !(types & (1 << type))) { - nested_vmx_failValid(vcpu, + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -9047,47 +9530,39 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) kvm_inject_page_fault(vcpu, &e); return 1; } - if (operand.vpid >> 16) { - nested_vmx_failValid(vcpu, + if (operand.vpid >> 16) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } + vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) { - nested_vmx_failValid(vcpu, + is_noncanonical_address(operand.gla, vcpu)) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } - if (cpu_has_vmx_invvpid_individual_addr() && - vmx->nested.vpid02) { + if (cpu_has_vmx_invvpid_individual_addr()) { __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vmx->nested.vpid02, operand.gla); + vpid02, operand.gla); } else - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!operand.vpid) { - nested_vmx_failValid(vcpu, + if (!operand.vpid) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } - nested_vmx_succeed(vcpu); - - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static int handle_invpcid(struct kvm_vcpu *vcpu) @@ -9158,11 +9633,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - kvm_mmu_free_roots(vcpu, roots_to_free); + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); /* * If neither the current cr3 nor any of the prev_roots use the * given PCID, then nothing needs to be done here because a @@ -9289,7 +9764,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; - mmu->base_role.ad_disabled = !accessed_dirty; + mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = address; /* * TODO: Check what's the correct approach in case @@ -9648,9 +10123,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; else if 
(is_page_fault(intr_info)) return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; - else if (is_no_device(intr_info) && - !(vmcs12->guest_cr0 & X86_CR0_TS)) - return false; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -10672,9 +11144,25 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.sync_shadow_vmcs) { - copy_vmcs12_to_shadow(vmx); - vmx->nested.sync_shadow_vmcs = false; + if (vmx->nested.need_vmcs12_sync) { + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && + !vmx->nested.hv_evmcs) + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (vmx->nested.hv_evmcs) { + copy_vmcs12_to_enlightened(vmx); + /* All fields are clean */ + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + } else { + copy_vmcs12_to_shadow(vmx); + } + vmx->nested.need_vmcs12_sync = false; } if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -10741,7 +11229,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" "jmp 1f \n\t" "2: \n\t" - __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ "mov %c[cr2](%0), %%" _ASM_AX " \n\t" @@ -10773,9 +11261,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Enter guest mode */ "jne 1f \n\t" - __ex(ASM_VMX_VMLAUNCH) "\n\t" + __ex("vmlaunch") "\n\t" "jmp 2f \n\t" - "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "1: " __ex("vmresume") "\n\t" "2: " /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" @@ -10797,6 +11285,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" + /* + * Clear host registers marked as clobbered to prevent + * speculative use. 
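The vmx_vcpu_run() hunk above implements KVM's half of the enlightened-VMCS clean-fields handshake: after copy_vmcs12_to_enlightened() it sets every bit in hv_clean_fields, declaring all field groups unmodified. The other half is L1's responsibility and is only assumed by this patch: when L1 modifies an eVMCS field it must clear the bit covering that field's group so that the prepare_vmcs02 paths further down reload the group. Roughly, from L1's side (illustrative only; the group assignment mirrors the GUEST_GRP1 usage visible later in this diff):

	evmcs->guest_sysenter_eip = new_sysenter_eip;
	evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;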
+ */ "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -10954,6 +11446,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); + + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); } /* @@ -10962,12 +11458,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) */ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vcpu_load(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - free_nested(vmx); - vcpu_put(vcpu); + vcpu_load(vcpu); + vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); + free_nested(vcpu); + vcpu_put(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -11330,28 +11824,28 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); - if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) - return 1; + vcpu->arch.mmu = &vcpu->arch.guest_mmu; kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu), nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu.set_cr3 = vmx_set_cr3; - vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; - vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->set_cr3 = vmx_set_cr3; + vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) { - vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -11768,15 +12262,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - u64 address = vmcs12->pml_address; - int maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { - if (!nested_cpu_has_ept(vmcs12) || - !IS_ALIGNED(address, 4096) || - address >> maxphyaddr) - return -EINVAL; - } + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->pml_address)) + return -EINVAL; return 0; } @@ -11956,112 +12447,87 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 0; } -static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). 
+ */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + return nested_cpu_has_ept(vmcs12) || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); +static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + return vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); + else + return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); +} - if (cpu_has_vmx_posted_intr()) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); +static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) +{ + /* + * If vmcs02 hasn't been initialized, set the constant vmcs02 state + * according to L0's settings (vmcs12 is irrelevant here). Host + * fields that come from L0 and are not constant, e.g. HOST_CR3, + * will be set as needed prior to VMLAUNCH/VMRESUME. + */ + if (vmx->nested.vmcs02_initialized) + return; + vmx->nested.vmcs02_initialized = true; /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 
- * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. + * We don't care what the EPTP value is we just need to guarantee + * it's valid so we don't get a false positive when doing early + * consistency checks. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); + if (enable_ept && nested_early_check) + vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); - if (cpu_has_vmx_apicv()) { - vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - } + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() - * is called. - */ - vmx_set_constant_host_state(vmx); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + + if (enable_pml) + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); /* - * Set the MSR load/store lists to match L0's settings. + * Set the MSR load/store lists to match L0's settings. Only the + * addresses are constant (for vmcs02), the counts can change based + * on L2's behavior, e.g. switching to/from long mode. 
*/ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - set_cr4_guest_host_mask(vmx); + vmx_set_constant_host_state(vmx); +} - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, + struct vmcs12 *vmcs12) +{ + prepare_vmcs02_constant_state(vmx); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12069,78 +12535,30 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } - - /* - * L1 may access the L2's PDPTR, so save them to construct vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); } -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control, vmcs12_exec_ctrl; + u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12) { - prepare_vmcs02_full(vcpu, vmcs12); - vmx->nested.dirty_vmcs12 = false; - } + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + prepare_vmcs02_early_full(vmx, vmcs12); /* - * First, the fields that are shadowed. This must be kept in sync - * with vmx_shadow_fields.h. + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before + * entry, but only if the current (host) sp changed from the value + * we wrote last (vmx->host_rsp). This cache is no longer relevant + * if we switch vmcs, and rather than hold a separate cache per vmcs, + * here we just force the write to happen on entry. host_rsp will + * also be written unconditionally by nested_vmx_check_vmentry_hw() + * if we are doing early consistency checks via hardware. 
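The comment above, and the vmx->host_rsp = 0 assignment that closes it, lean on the (unchanged) HOST_RSP caching in the VMEnter path, which only rewrites the field when the current stack pointer differs from the cached value. As a C model (not code from this patch):

	/* just before VMLAUNCH/VMRESUME */
	if (current_rsp != vmx->host_rsp) {
		vmx->host_rsp = current_rsp;
		vmcs_writel(HOST_RSP, current_rsp);
	}

Zeroing the cache after a vmcs switch guarantees the comparison fails, so HOST_RSP is written into the freshly loaded vmcs02 (or vmcs01); nested_vmx_check_vmentry_hw(), added further down, also writes it unconditionally when early hardware checks are enabled.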
*/ + vmx->host_rsp = 0; - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); - } else { - kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); - } - if (vmx->nested.nested_run_pending) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); - } else { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); - } - vmx_set_rflags(vcpu, vmcs12->guest_rflags); - + /* + * PIN CONTROLS + */ exec_control = vmcs12->pin_based_vm_exec_control; /* Preemption timer setting is computed directly in vmx_vcpu_run. */ @@ -12155,13 +12573,43 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } else { exec_control &= ~PIN_BASED_POSTED_INTR; } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + /* + * EXEC CONTROLS + */ + exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif + } + + /* + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. + */ + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + /* + * SECONDARY EXEC CONTROLS + */ if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx->secondary_exec_control; @@ -12202,43 +12650,214 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. + * ENTRY CONTROLS + * + * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE + * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate + * on the related bits (if supported by the CPU) in the hope that + * we can avoid VMWrites during vmx_set_efer(). 
+ */ + exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & + ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + if (cpu_has_load_ia32_efer) { + if (guest_efer & EFER_LMA) + exec_control |= VM_ENTRY_IA32E_MODE; + if (guest_efer != host_efer) + exec_control |= VM_ENTRY_LOAD_IA32_EFER; + } + vm_entry_controls_init(vmx, exec_control); + + /* + * EXIT CONTROLS + * + * L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - vmx->host_rsp = 0; + exec_control = vmcs_config.vmexit_ctrl; + if (cpu_has_load_ia32_efer && guest_efer != host_efer) + exec_control |= VM_EXIT_LOAD_IA32_EFER; + vm_exit_controls_init(vmx, exec_control); - exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - exec_control &= ~CPU_BASED_TPR_SHADOW; - exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit and never change + * the PML address (once set), this happens to be equivalent to + * simply resetting the index in vmcs02. + */ + if (enable_pml) + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. + * Interrupt/Exception Fields */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + if (vmx->nested.nested_run_pending) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING; -#endif + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } +} + +static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, 
vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + } + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + /* + * L1 may access the L2's PDPTR, so save them to construct + * vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } } + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + /* - * A vmexit (to either L1 hypervisor or L0 userspace) is always needed - * for I/O port accesses. + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; - exec_control |= CPU_BASED_UNCOND_IO_EXITING; + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? 
vmcs12->page_fault_error_code_match : 0); - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + set_cr4_guest_host_mask(vmx); + + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { + prepare_vmcs02_full(vmx, vmcs12); + vmx->nested.dirty_vmcs12 = false; + } + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. + */ + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + } + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } + vmx_set_rflags(vcpu, vmcs12->guest_rflags); + + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to @@ -12248,20 +12867,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* L2->L1 exit controls are emulated - the hardware exit is to L0 so - * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER - * bits are further modified by vmx_set_efer() below. - */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are - * emulated by vmx_set_efer(), below. 
- */ - vm_entry_controls_init(vmx, - (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & - ~VM_ENTRY_IA32E_MODE) | - (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); @@ -12284,37 +12889,29 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * influence global bitmap(for vpid01 and vpid02 allocation) * even if spawn a lot of nested vCPUs. */ - if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); } } else { - vmx_flush_tlb(vcpu, true); + /* + * If L1 use EPT, then L0 needs to execute INVEPT on + * EPTP02 instead of EPTP01. Therefore, delay TLB + * flush until vmcs02->eptp is fully updated by + * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_TLB_FLUSH is evaluated after + * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } } - if (enable_pml) { - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit, this happens - * to be equivalent to simply resetting the fields in vmcs02. - */ - ASSERT(vmx->pml_pg); - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - } - - if (nested_cpu_has_ept(vmcs12)) { - if (nested_ept_init_mmu_context(vcpu)) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - } else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + if (nested_cpu_has_ept(vmcs12)) + nested_ept_init_mmu_context(vcpu); + else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmx_flush_tlb(vcpu, true); - } /* * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those @@ -12330,14 +12927,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); + /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); /* @@ -12379,6 +12970,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + bool ia32e; if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) @@ -12453,6 +13045,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. 
In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + } + + /* * From the Intel SDM, volume 3: * Fields relevant to VM-entry event injection must be set properly. * These fields are the VM-entry interruption-information field, the @@ -12508,6 +13115,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } } + if (nested_cpu_has_ept(vmcs12) && + !valid_ept_address(vcpu, vmcs12->ept_pointer)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return 0; } @@ -12573,21 +13184,6 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 1; } - /* - * If the load IA32_EFER VM-exit control is 1, bits reserved in the - * IA32_EFER MSR must be 0 in the field for that register. In addition, - * the values of the LMA and LME bits in the field must each be that of - * the host address-space size VM-exit control. - */ - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) - return 1; - } - if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) @@ -12596,26 +13192,139 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } +static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + + if (!nested_early_check) + return 0; + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + preempt_disable(); + + vmx_prepare_switch_to_guest(vcpu); + + /* + * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, + * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to + * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * there is no need to preserve other bits or save/restore the field. 
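The function body that follows makes vmcs02's guest state deliberately illegal (RFLAGS bit 1 is reserved to '1'), so the trial VMLAUNCH/VMRESUME can never actually enter L2. The SDM orders VM-entry checks so that control and host-state fields are validated before guest state: if those earlier checks fail, the instruction VMFails and vmx->fail is set; if they pass, entry still aborts on the bad guest state and the CPU delivers a VM-entry-failure VMExit back to L0. Schematically, the two outcomes the code below distinguishes are:

	if (vmx->fail) {
		/* VMFail: a control or host-state field is inconsistent */
		/* ...reflect VMXERR_ENTRY_INVALID_CONTROL_FIELD to L1... */
	} else {
		/* the induced failure: entry aborted on guest state */
		WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
			  VMX_EXIT_REASONS_FAILED_VMENTRY));
	}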
+ */ + vmcs_writel(GUEST_RFLAGS, 0); + + vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + vmx->__launched = vmx->loaded_vmcs->launched; + + asm( + /* Set HOST_RSP */ + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" + + /* Check if vmlaunch of vmresume is needed */ + "cmpl $0, %c[launched](%0)\n\t" + "je 1f\n\t" + __ex("vmresume") "\n\t" + "jmp 2f\n\t" + "1: " __ex("vmlaunch") "\n\t" + "jmp 2f\n\t" + "2: " + + /* Set vmx->fail accordingly */ + "setbe %c[fail](%0)\n\t" + + ".pushsection .rodata\n\t" + ".global vmx_early_consistency_check_return\n\t" + "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" + ".popsection" + : + : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) + : "rax", "cc", "memory" + ); + + vmcs_writel(HOST_RIP, vmx_return); + + preempt_enable(); + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + if (vmx->fail) { + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + vmx->fail = 0; + return 1; + } + + /* + * VMExit clears RFLAGS.IF and DR7, even on a consistency check. + */ + local_irq_enable(); + if (hw_breakpoint_active()) + set_debugreg(__this_cpu_read(cpu_dr7), 7); + + /* + * A non-failing VMEntry means we somehow entered guest mode with + * an illegal RIP, and that's just the tip of the iceberg. There + * is no telling what memory has been modified or what state has + * been exposed to unknown code. Hitting this all but guarantees + * a (very critical) hardware issue. + */ + WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & + VMX_EXIT_REASONS_FAILED_VMENTRY)); + + return 0; +} +STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); + +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + /* - * If exit_qual is NULL, this is being called from state restore (either RSM + * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. ++ * ++ * Returns: ++ * 0 - success, i.e. 
proceed with actual VMEnter ++ * 1 - consistency check VMExit ++ * -1 - consistency check VMFail */ -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) +static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - bool from_vmentry = !!exit_qual; - u32 dummy_exit_qual; bool evaluate_pending_interrupts; - int r = 0; + u32 exit_reason = EXIT_REASON_INVALID_STATE; + u32 exit_qual; evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - enter_guest_mode(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && @@ -12623,24 +13332,35 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - vmx_segment_cache_clear(vmx); + prepare_vmcs02_early(vmx, vmcs12); + + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); + + if (nested_vmx_check_vmentry_hw(vcpu)) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return -1; + } + + if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit; + } + + enter_guest_mode(vcpu); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; - r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) - goto fail; + if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit_guest_mode; if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); - - r = EXIT_REASON_MSR_LOAD_FAIL; - *exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (*exit_qual) - goto fail; + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (exit_qual) + goto vmentry_fail_vmexit_guest_mode; } else { /* * The MMU is not initialized to point at the right entities yet and @@ -12677,12 +13397,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) */ return 0; -fail: + /* + * A failed consistency check that leads to a VMExit during L1's + * VMEnter to L2 is a variation of a normal VMexit, as explained in + * 26.7 "VM-entry failures during or after loading guest state". 
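nested_vmx_enter_non_root_mode() now reports three outcomes: 0 (proceed with the real VMEnter), 1 (a consistency-check failure reflected to L1 as a failed-VMEntry VMExit), and -1 (a consistency-check VMFail). The caller's handling appears later in this diff; condensed, it reads:

	ret = nested_vmx_enter_non_root_mode(vcpu, true);
	vmx->nested.nested_run_pending = !ret;
	if (ret > 0)		/* VMExit to L1 already prepared */
		return 1;
	else if (ret)		/* VMFail: report an instruction error */
		return nested_vmx_failValid(vcpu,
				VMXERR_ENTRY_INVALID_CONTROL_FIELD);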
+ */ +vmentry_fail_vmexit_guest_mode: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); + +vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return r; + + if (!from_vmentry) + return 1; + + load_vmcs12_host_state(vcpu, vmcs12); + vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->exit_qualification = exit_qual; + if (enable_shadow_vmcs || vmx->nested.hv_evmcs) + vmx->nested.need_vmcs12_sync = true; + return 1; } /* @@ -12694,14 +13430,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - u32 exit_qual; int ret; if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_check_vmcs12(vcpu)) - goto out; + if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) + return 1; + + if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -12711,13 +13449,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ - if (vmcs12->hdr.shadow_vmcs) { - nested_vmx_failInvalid(vcpu); - goto out; - } + if (vmcs12->hdr.shadow_vmcs) + return nested_vmx_failInvalid(vcpu); - if (enable_shadow_vmcs) + if (vmx->nested.hv_evmcs) { + copy_enlightened_to_vmcs12(vmx); + /* Enlightened VMCS doesn't have launch state */ + vmcs12->launch_state = !launch; + } else if (enable_shadow_vmcs) { copy_shadow_to_vmcs12(vmx); + } /* * The nested entry process starts with enforcing various prerequisites @@ -12729,59 +13470,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) { - nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - goto out; - } + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - if (vmcs12->launch_state == launch) { - nested_vmx_failValid(vcpu, + if (vmcs12->launch_state == launch) + return nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - goto out; - } ret = check_vmentry_prereqs(vcpu, vmcs12); - if (ret) { - nested_vmx_failValid(vcpu, ret); - goto out; - } - - /* - * After this point, the trap flag no longer triggers a singlestep trap - * on the vm entry instructions; don't call kvm_skip_emulated_instruction. - * This is not 100% correct; for performance reasons, we delegate most - * of the checks on host state to the processor. If those fail, - * the singlestep trap is missed. - */ - skip_emulated_instruction(vcpu); - - ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual); - if (ret) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } + if (ret) + return nested_vmx_failValid(vcpu, ret); /* * We're finally done with prerequisite checking, and can start with * the nested entry. 
*/ - vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu, &exit_qual); - if (ret) { - nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); - vmx->nested.nested_run_pending = 0; + ret = nested_vmx_enter_non_root_mode(vcpu, true); + vmx->nested.nested_run_pending = !ret; + if (ret > 0) return 1; - } + else if (ret) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; /* - * Must happen outside of enter_vmx_non_root_mode() as it will + * Must happen outside of nested_vmx_enter_non_root_mode() as it will * also be used as part of restoring nVMX state for * snapshot restore (migration). * @@ -12802,9 +13521,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; - -out: - return kvm_skip_emulated_instruction(vcpu); } /* @@ -13118,24 +13834,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_clear_interrupt_queue(vcpu); } -static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - u32 entry_failure_code; - - nested_ept_uninit_mmu_context(vcpu); - - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; -} - /* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified @@ -13149,6 +13847,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -13161,6 +13860,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); + vmx_set_interrupt_shadow(vcpu, 0); + /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because vmx_set_cr0 refers to efer set above. @@ -13175,23 +13876,35 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs12->host_cr4); - load_vmcs12_mmu_host_state(vcpu, vmcs12); + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; /* - * If vmcs01 don't use VPID, CPU flushes TLB on every + * If vmcs01 doesn't use VPID, CPU flushes TLB on every * VMEntry/VMExit. Thus, no need to flush TLB. * - * If vmcs12 uses VPID, TLB entries populated by L2 are - * tagged with vmx->nested.vpid02 while L1 entries are tagged - * with vmx->vpid. Thus, no need to flush TLB. + * If vmcs12 doesn't use VPID, L1 expects TLB to be + * flushed on every VMEntry/VMExit. + * + * Otherwise, we can preserve TLB entries as long as we are + * able to tag L1 TLB entries differently than L2 TLB entries. 
* - * Therefore, flush TLB only in case vmcs01 uses VPID and - * vmcs12 don't use VPID as in this case L1 & L2 TLB entries - * are both tagged with vmx->vpid. + * If vmcs12 uses EPT, we need to execute this flush on EPTP01 + * and therefore we request the TLB flush to happen only after VMCS EPTP + * has been set by KVM_REQ_LOAD_CR3. */ if (enable_vpid && - !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) { - vmx_flush_tlb(vcpu, true); + (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); @@ -13271,6 +13984,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } +static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +{ + struct shared_msr_entry *efer_msr; + unsigned int i; + + if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) + return vmcs_read64(GUEST_IA32_EFER); + + if (cpu_has_load_ia32_efer) + return host_efer; + + for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { + if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) + return vmx->msr_autoload.guest.val[i].value; + } + + efer_msr = find_msr_entry(vmx, MSR_EFER); + if (efer_msr) + return efer_msr->data; + + return host_efer; +} + +static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msr_entry g, h; + struct msr_data msr; + gpa_t gpa; + u32 i, j; + + vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + /* + * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set + * as vmcs01.GUEST_DR7 contains a userspace defined value + * and vcpu->arch.dr7 is not squirreled away before the + * nested VMENTER (not worth adding a variable in nested_vmx). + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + else + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. + */ + vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); + + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + + nested_ept_uninit_mmu_context(vcpu); + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs + * from vmcs01 (if necessary). The PDPTRs are not loaded on + * VMFail, like everything else we just need to ensure our + * software model is up-to-date. + */ + ept_save_pdptrs(vcpu); + + kvm_mmu_reset_context(vcpu); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + /* + * This nasty bit of open coding is a compromise between blindly + * loading L1's MSRs using the exit load lists (incorrect emulation + * of VMFail), leaving the nested VM's MSRs in the software model + * (incorrect behavior) and snapshotting the modified MSRs (too + * expensive since the lists are unbound by hardware). For each + * MSR that was (prematurely) loaded from the nested VMEntry load + * list, reload it from the exit load list if it exists and differs + * from the guest value. 
The intent is to stuff host state as + * silently as possible, not to fully process the exit load list. + */ + msr.host_initiated = false; + for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { + gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); + if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { + pr_debug_ratelimited( + "%s read MSR index failed (%u, 0x%08llx)\n", + __func__, i, gpa); + goto vmabort; + } + + for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { + gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); + if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { + pr_debug_ratelimited( + "%s read MSR failed (%u, 0x%08llx)\n", + __func__, j, gpa); + goto vmabort; + } + if (h.index != g.index) + continue; + if (h.value == g.value) + break; + + if (nested_vmx_load_msr_check(vcpu, &h)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, j, h.index, h.reserved); + goto vmabort; + } + + msr.index = h.index; + msr.data = h.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", + __func__, j, h.index, h.value); + goto vmabort; + } + } + } + + return; + +vmabort: + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + /* * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if @@ -13286,14 +14133,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); - /* - * The only expected VM-instruction error is "VM entry with - * invalid control field(s)." Anything else indicates a - * problem with L0. - */ - WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD)); - leave_guest_mode(vcpu); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) @@ -13320,12 +14159,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } else { + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. And we should never get here with a + * VMFail of any type if early consistency checks are enabled. 
+ */ + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + WARN_ON_ONCE(nested_early_check); } vmx_switch_vmcs(vcpu, &vmx->vmcs01); - vm_entry_controls_reset_shadow(vmx); - vm_exit_controls_reset_shadow(vmx); - vmx_segment_cache_clear(vmx); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -13369,8 +14215,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (enable_shadow_vmcs && exit_reason != -1) - vmx->nested.sync_shadow_vmcs = true; + if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + vmx->nested.need_vmcs12_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -13405,24 +14251,24 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, return; } - + /* * After an early L2 VM-entry failure, we're now back * in L1 which thinks it just finished a VMLAUNCH or * VMRESUME instruction, so we need to set the failure * flag and the VM-instruction error field of the VMCS - * accordingly. + * accordingly, and skip the emulated instruction. */ - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - - load_vmcs12_mmu_host_state(vcpu, vmcs12); + (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* - * The emulated instruction was already skipped in - * nested_vmx_run, but the updated RIP was never - * written back to the vmcs01. + * Restore L1's host state to KVM's software model. We're here + * because a consistency check was caught by hardware, which + * means some amount of guest state has been propagated to KVM's + * model and needs to be unwound to the host's state. */ - skip_emulated_instruction(vcpu); + nested_vmx_restore_host_state(vcpu); + vmx->fail = 0; } @@ -13435,26 +14281,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); } - free_nested(to_vmx(vcpu)); -} - -/* - * L1's failure to enter L2 is a subset of a normal exit, as explained in - * 23.7 "VM-entry failures during or after loading guest state" (this also - * lists the acceptable exit-reason and exit-qualification parameters). - * It should only be called before L2 actually succeeded to run, and when - * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 
- */ -static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification) -{ - load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = qualification; - nested_vmx_succeed(vcpu); - if (enable_shadow_vmcs) - to_vmx(vcpu)->nested.sync_shadow_vmcs = true; + free_nested(vcpu); } static int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -13880,7 +14707,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu, NULL); + ret = nested_vmx_enter_non_root_mode(vcpu, false); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -13895,6 +14722,20 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * In case we do two consecutive get/set_nested_state()s while L2 was + * running hv_evmcs may end up not being mapped (we map it from + * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always + * have vmcs12 if it is true. + */ + return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs; +} + static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) @@ -13914,12 +14755,16 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); + + if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + if (nested_vmx_allowed(vcpu) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - if (vmx->nested.current_vmptr != -1ull) { + if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += VMCS12_SIZE; if (is_guest_mode(vcpu) && @@ -13948,20 +14793,24 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) return -EFAULT; - if (vmx->nested.current_vmptr == -1ull) + if (!vmx_has_valid_vmcs12(vcpu)) goto out; /* * When running L2, the authoritative vmcs12 state is in the * vmcs02. When running L1, the authoritative vmcs12 state is - * in the shadow vmcs linked to vmcs01, unless - * sync_shadow_vmcs is set, in which case, the authoritative + * in the shadow or enlightened vmcs linked to vmcs01, unless + * need_vmcs12_sync is set, in which case, the authoritative * vmcs12 state is in the vmcs12 already. */ - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { sync_vmcs12(vcpu, vmcs12); - else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); + } else if (!vmx->nested.need_vmcs12_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) return -EFAULT; @@ -13989,6 +14838,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != 0) return -EINVAL; + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + if (!nested_vmx_allowed(vcpu)) return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; @@ -14006,13 +14858,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) return -EINVAL; - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) - return -EINVAL; - - if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || - !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) - return -EINVAL; - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return -EINVAL; @@ -14042,7 +14887,25 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (ret) return ret; - set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + /* Empty 'VMXON' state is permitted */ + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return 0; + + if (kvm_state->vmx.vmcs_pa != -1ull) { + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { + /* + * Sync eVMCS upon entry as we may not have + * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + */ + vmx->nested.need_vmcs12_sync = true; + } else { + return -EINVAL; + } if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { vmx->nested.smm.vmxon = true; @@ -14086,7 +14949,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; vmx->nested.dirty_vmcs12 = true; - ret = enter_vmx_non_root_mode(vcpu, NULL); + ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) return -EINVAL; @@ -14238,6 +15101,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, + + .nested_enable_evmcs = nested_enable_evmcs, }; static void vmx_cleanup_l1d_flush(void) diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h index cd0c75f6d037..132432f375c2 100644 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ b/arch/x86/kvm/vmx_shadow_fields.h @@ -28,7 +28,6 @@ */ /* 16-bits */ -SHADOW_FIELD_RW(GUEST_CS_SELECTOR) SHADOW_FIELD_RW(GUEST_INTR_STATUS) SHADOW_FIELD_RW(GUEST_PML_INDEX) SHADOW_FIELD_RW(HOST_FS_SELECTOR) @@ -47,8 +46,8 @@ SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_CS_LIMIT) SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) @@ -61,8 +60,6 @@ SHADOW_FIELD_RW(GUEST_CR0) SHADOW_FIELD_RW(GUEST_CR3) SHADOW_FIELD_RW(GUEST_CR4) SHADOW_FIELD_RW(GUEST_RFLAGS) -SHADOW_FIELD_RW(GUEST_CS_BASE) -SHADOW_FIELD_RW(GUEST_ES_BASE) SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) SHADOW_FIELD_RW(CR0_READ_SHADOW) SHADOW_FIELD_RW(CR4_READ_SHADOW) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca717737347e..bdcb5babfb68 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,7 +136,7 @@ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 0; +unsigned int __read_mostly lapic_timer_advance_ns = 1000; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); @@ -400,9 +400,51 @@ static int exception_type(int vector) return EXCPT_FAULT; } +void 
kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +{ + unsigned nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; + + if (!has_payload) + return; + + switch (nr) { + case DB_VECTOR: + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~DR_TRAP_BITS; + /* + * DR6.RTM is set by all #DB exceptions that don't clear it. + */ + vcpu->arch.dr6 |= DR6_RTM; + vcpu->arch.dr6 |= payload; + /* + * Bit 16 should be set in the payload whenever the #DB + * exception should clear DR6.RTM. This makes the payload + * compatible with the pending debug exceptions under VMX. + * Though not currently documented in the SDM, this also + * makes the payload compatible with the exit qualification + * for #DB exceptions under VMX. + */ + vcpu->arch.dr6 ^= payload & DR6_RTM; + break; + case PF_VECTOR: + vcpu->arch.cr2 = payload; + break; + } + + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, - bool reinject) + bool has_payload, unsigned long payload, bool reinject) { u32 prev_nr; int class1, class2; @@ -424,6 +466,14 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, */ WARN_ON_ONCE(vcpu->arch.exception.pending); vcpu->arch.exception.injected = true; + if (WARN_ON_ONCE(has_payload)) { + /* + * A reinjected event has already + * delivered its payload. + */ + has_payload = false; + payload = 0; + } } else { vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; @@ -431,6 +481,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = has_payload; + vcpu->arch.exception.payload = payload; + /* + * In guest mode, payload delivery should be deferred, + * so that the L1 hypervisor can intercept #PF before + * CR2 is modified (or intercept #DB before DR6 is + * modified under nVMX). However, for ABI + * compatibility with KVM_GET_VCPU_EVENTS and + * KVM_SET_VCPU_EVENTS, we can't delay payload + * delivery unless userspace has enabled this + * functionality via the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD. 
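
(Illustration only: the #DB case in kvm_deliver_exception_payload() above folds the payload into DR6 with a fixed mask/set/xor sequence. The standalone sketch below mirrors that sequence outside the kernel; DR_TRAP_BITS and DR6_RTM are redefined here to match the kernel's values, and the starting DR6 is just the architectural reset value.)

    #include <stdint.h>
    #include <stdio.h>

    #define DR_TRAP_BITS    0xfULL          /* B0-B3, as in the kernel */
    #define DR6_RTM         (1ULL << 16)    /* "not in RTM" bit */

    static uint64_t fold_db_payload(uint64_t dr6, uint64_t payload)
    {
            dr6 &= ~DR_TRAP_BITS;           /* certain #DBs clear B0-B3 */
            dr6 |= DR6_RTM;                 /* set unless the payload clears it */
            dr6 |= payload;
            dr6 ^= payload & DR6_RTM;       /* payload bit 16 => clear DR6.RTM */
            return dr6;
    }

    int main(void)
    {
            /* single-step trap: payload DR6_BS (bit 14) leaves DR6.RTM set */
            printf("%#llx\n",
                   (unsigned long long)fold_db_payload(0xffff0ff0ULL, 1ULL << 14));
            return 0;
    }
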
+ */ + if (!vcpu->kvm->arch.exception_payload_enabled || + !is_guest_mode(vcpu)) + kvm_deliver_exception_payload(vcpu); return; } @@ -455,6 +521,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.has_error_code = true; vcpu->arch.exception.nr = DF_VECTOR; vcpu->arch.exception.error_code = 0; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; } else /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost @@ -464,16 +532,29 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, true); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception); +static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) +{ + kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); +} + +static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, + u32 error_code, unsigned long payload) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, + true, payload, false); +} + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -490,11 +571,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) ++vcpu->stat.pf_guest; vcpu->arch.exception.nested_apf = is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) + if (vcpu->arch.exception.nested_apf) { vcpu->arch.apf.nested_apf_token = fault->address; - else - vcpu->arch.cr2 = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); + kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); + } else { + kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, + fault->address); + } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); @@ -503,7 +586,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + vcpu->arch.mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } @@ -517,13 +600,13 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, true); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); @@ -602,7 +685,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & - vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { + vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -2477,7 +2560,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data)) + if 
(kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -2912,6 +2995,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: + case KVM_CAP_HYPERV_SEND_IPI: + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2930,6 +3015,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: + case KVM_CAP_EXCEPTION_PAYLOAD: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3185,11 +3271,16 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. + * Here dr6 is either zero or, if the guest has run and userspace + * has not set any breakpoints or watchpoints, it can be set to + * the guest dr6 (stored in vcpu->arch.dr6). do_debug expects dr6 + * to be cleared after it runs, so clear the host register. However, + * MOV to DR can be expensive when running nested, omit it if + * vcpu->arch.dr6 is already zero: in that case, the host dr6 cannot + * currently be nonzero. */ - set_debugreg(0, 6); + if (vcpu->arch.dr6) + set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -3362,19 +3453,33 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { process_nmi(vcpu); + /* - * FIXME: pass injected and pending separately. This is only - * needed for nested virtualization, whose state cannot be - * migrated yet. For now we can combine them. + * The API doesn't provide the instruction length for software + * exceptions, so don't report them. As long as the guest RIP + * isn't advanced, we should expect to encounter the exception + * again. */ - events->exception.injected = - (vcpu->arch.exception.pending || - vcpu->arch.exception.injected) && - !kvm_exception_is_soft(vcpu->arch.exception.nr); + if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { + events->exception.injected = 0; + events->exception.pending = 0; + } else { + events->exception.injected = vcpu->arch.exception.injected; + events->exception.pending = vcpu->arch.exception.pending; + /* + * For ABI compatibility, deliberately conflate + * pending and injected exceptions when + * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. 
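
(From the userspace side the pending/injected distinction only becomes visible once the capability is enabled. A minimal sketch of reading it back follows; vcpu_fd is an assumed, already-created vCPU fd, and the struct fields require uapi headers that carry the payload additions from this series.)

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static void dump_exception_state(int vcpu_fd)
    {
            struct kvm_vcpu_events ev;

            if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
                    return;

            if (ev.flags & KVM_VCPUEVENT_VALID_PAYLOAD)
                    printf("exception nr=%u pending=%u injected=%u payload=%#llx\n",
                           ev.exception.nr, ev.exception.pending,
                           ev.exception.injected,
                           (unsigned long long)ev.exception_payload);
            else
                    /* old ABI: pending exceptions are folded into 'injected' */
                    printf("exception nr=%u injected=%u\n",
                           ev.exception.nr, ev.exception.injected);
    }
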
+ */ + if (!vcpu->kvm->arch.exception_payload_enabled) + events->exception.injected |= + vcpu->arch.exception.pending; + } events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; + events->exception_has_payload = vcpu->arch.exception.has_payload; + events->exception_payload = vcpu->arch.exception.payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; @@ -3398,6 +3503,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM); + if (vcpu->kvm->arch.exception_payload_enabled) + events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -3409,12 +3517,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM)) + | KVM_VCPUEVENT_VALID_SMM + | KVM_VCPUEVENT_VALID_PAYLOAD)) return -EINVAL; - if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || - is_guest_mode(vcpu))) + if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { + if (!vcpu->kvm->arch.exception_payload_enabled) + return -EINVAL; + if (events->exception.pending) + events->exception.injected = 0; + else + events->exception_has_payload = 0; + } else { + events->exception.pending = 0; + events->exception_has_payload = 0; + } + + if ((events->exception.injected || events->exception.pending) && + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; /* INITs are latched while in SMM */ @@ -3424,11 +3544,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; process_nmi(vcpu); - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.injected = events->exception.injected; + vcpu->arch.exception.pending = events->exception.pending; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; + vcpu->arch.exception.has_payload = events->exception_has_payload; + vcpu->arch.exception.payload = events->exception_payload; vcpu->arch.interrupt.injected = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; @@ -3694,6 +3816,10 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { + int r; + uint16_t vmcs_version; + void __user *user_ptr; + if (cap->flags) return -EINVAL; @@ -3706,6 +3832,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); + if (!r) { + user_ptr = (void __user *)(uintptr_t)cap->args[0]; + if (copy_to_user(user_ptr, &vmcs_version, + sizeof(vmcs_version))) + r = -EFAULT; + } + return r; + default: return -EINVAL; } @@ -4047,11 +4183,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; if (kvm_state.flags & - ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + 
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE + | KVM_STATE_NESTED_EVMCS)) break; /* nested_run_pending implies guest_mode. */ - if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) + && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); @@ -4363,6 +4501,10 @@ split_irqchip_unlock: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; r = 0; break; + case KVM_CAP_EXCEPTION_PAYLOAD: + kvm->arch.exception_payload_enabled = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -4803,7 +4945,7 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); return t_gpa; } @@ -5889,7 +6031,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; - if (!vcpu->arch.mmu.direct_map) { + if (!vcpu->arch.mmu->direct_map) { /* * Write permission should be allowed since only * write access need to be emulated. @@ -5922,7 +6064,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, kvm_release_pfn_clean(pfn); /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu.direct_map) { + if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; spin_lock(&vcpu->kvm->mmu_lock); @@ -5989,7 +6131,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, vcpu->arch.last_retry_eip = ctxt->eip; vcpu->arch.last_retry_addr = cr2; - if (!vcpu->arch.mmu.direct_map) + if (!vcpu->arch.mmu->direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6049,14 +6191,7 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); } } @@ -6995,10 +7130,22 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR && - (vcpu->arch.dr7 & DR7_GD)) { - vcpu->arch.dr7 &= ~DR7_GD; - kvm_update_dr7(vcpu); + if (vcpu->arch.exception.nr == DB_VECTOR) { + /* + * This code assumes that nSVM doesn't use + * check_nested_events(). If it does, the + * DR6/DR7 changes should happen before L1 + * gets a #VMEXIT for an intercepted #DB in + * L2. (Under VMX, on the other hand, the + * DR6/DR7 changes should not happen in the + * event of a VM-exit to L1 for an intercepted + * #DB in L2.) 
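
(The deferred-payload behaviour discussed here only takes effect after userspace turns on the per-VM capability wired up in kvm_vm_ioctl_enable_cap() earlier in this hunk. A minimal sketch of that opt-in; vm_fd is an assumed VM fd and error handling is reduced to perror().)

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static int enable_exception_payload(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_EXCEPTION_PAYLOAD,
                    .args = { 1 },
            };

            if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
                    perror("KVM_ENABLE_CAP(KVM_CAP_EXCEPTION_PAYLOAD)");
                    return -1;
            }
            return 0;
    }
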
+ */ + kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.dr7 & DR7_GD) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); + } } kvm_x86_ops->queue_exception(vcpu); @@ -8478,7 +8625,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); - kvm_mmu_setup(vcpu); + kvm_init_mmu(vcpu, false); vcpu_put(vcpu); return 0; } @@ -9327,7 +9474,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; - if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || + if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || work->wakeup_all) return; @@ -9335,11 +9482,11 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) if (unlikely(r)) return; - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) + if (!vcpu->arch.mmu->direct_map && + work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); + vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -9463,6 +9610,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, vcpu->arch.exception.nr = 0; vcpu->arch.exception.has_error_code = false; vcpu->arch.exception.error_code = 0; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 67b9568613f3..224cd0a47568 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, int handle_ud(struct kvm_vcpu *vcpu); +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu); + void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. 
+ */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ae394552fb94..089e78c4effd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..dd461c0167ef 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 7d00d4ad44d4..95997e6c0696 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -478,7 +478,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); diff --git a/block/bio.c b/block/bio.c index 8c680a776171..0093bed81c0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1684,7 +1684,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_round_stats(q, cpu, part); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index 4dbc93f43b38..cff0a60ee200 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2733,17 +2733,15 @@ void blk_account_io_done(struct request *req, u64 now) * containing request is enough. 
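
(The block-layer hunks below, like the bio.c hunk above, switch the per-partition time accounting from jiffies-based ticks[] to a 64-bit nanosecond counter; milliseconds are only derived at read time, which is what the part_stat_read_msecs() helper added to genhd.h later in this patch does. A standalone sketch of the read-side conversion, for orientation only:)

    #include <stdint.h>

    #define NSEC_PER_MSEC   1000000ULL

    /*
     * Old scheme: the duration was rounded to jiffies at completion time and
     * accumulated in ticks[], so sub-jiffy requests could account as zero.
     * New scheme: the raw nanosecond delta is accumulated in nsecs[] and only
     * converted when a reader wants milliseconds.
     */
    static inline uint64_t stat_nsecs_to_msecs(uint64_t nsecs)
    {
            return nsecs / NSEC_PER_MSEC;   /* div_u64() in kernel code */
    }
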
*/ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { - unsigned long duration; const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; - duration = nsecs_to_jiffies(now - req->start_time_ns); cpu = part_stat_lock(); part = req->part; part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); diff --git a/block/genhd.c b/block/genhd.c index 8cc719a37b32..be5bab20b2ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1343,18 +1343,18 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_READ]), part_stat_read(hd, merges[STAT_READ]), part_stat_read(hd, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(hd, STAT_READ), part_stat_read(hd, ios[STAT_WRITE]), part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 5a8975a1201c..d3d14e81fb12 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -136,18 +136,18 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(p, STAT_READ), part_stat_read(p, ios[STAT_WRITE]), part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index d8e159feb573..89110dfc7127 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -90,14 +90,17 @@ config EFI_ARMSTUB config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" depends on EFI_ARMSTUB + default y help Select this config option to add support for the dtb= command line parameter, allowing a device tree blob to be loaded into memory from the EFI System Partition by the stub. - The device tree is typically provided by the platform or by - the bootloader, so this option is mostly for development - purposes only. + If the device tree is provided by the platform or by + the bootloader this option may not be needed. 
+ But, for various development reasons and to maintain existing + functionality for bootloaders that do not have such support + this option is necessary. config EFI_BOOTLOADER_CONTROL tristate "EFI Bootloader Control" diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index e11ab12fbdf2..800986a79704 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -528,8 +528,8 @@ static int usbhs_omap_get_dt_pdata(struct device *dev, } static const struct of_device_id usbhs_child_match_table[] = { - { .compatible = "ti,omap-ehci", }, - { .compatible = "ti,omap-ohci", }, + { .compatible = "ti,ehci-omap", }, + { .compatible = "ti,ohci-omap3", }, { } }; @@ -855,6 +855,7 @@ static struct platform_driver usbhs_omap_driver = { .pm = &usbhsomap_dev_pm_ops, .of_match_table = usbhs_omap_dt_ids, }, + .probe = usbhs_omap_probe, .remove = usbhs_omap_remove, }; @@ -864,9 +865,9 @@ MODULE_ALIAS("platform:" USBHS_DRIVER_NAME); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("usb host common core driver for omap EHCI and OHCI"); -static int __init omap_usbhs_drvinit(void) +static int omap_usbhs_drvinit(void) { - return platform_driver_probe(&usbhs_omap_driver, usbhs_omap_probe); + return platform_driver_register(&usbhs_omap_driver); } /* @@ -878,7 +879,7 @@ static int __init omap_usbhs_drvinit(void) */ fs_initcall_sync(omap_usbhs_drvinit); -static void __exit omap_usbhs_drvexit(void) +static void omap_usbhs_drvexit(void) { platform_driver_unregister(&usbhs_omap_driver); } diff --git a/drivers/pinctrl/intel/pinctrl-cannonlake.c b/drivers/pinctrl/intel/pinctrl-cannonlake.c index fb1afe55bf53..8d48371caaa2 100644 --- a/drivers/pinctrl/intel/pinctrl-cannonlake.c +++ b/drivers/pinctrl/intel/pinctrl-cannonlake.c @@ -379,7 +379,7 @@ static const struct intel_padgroup cnlh_community1_gpps[] = { static const struct intel_padgroup cnlh_community3_gpps[] = { CNL_GPP(0, 155, 178, 192), /* GPP_K */ CNL_GPP(1, 179, 202, 224), /* GPP_H */ - CNL_GPP(2, 203, 215, 258), /* GPP_E */ + CNL_GPP(2, 203, 215, 256), /* GPP_E */ CNL_GPP(3, 216, 239, 288), /* GPP_F */ CNL_GPP(4, 240, 248, CNL_NO_GPIO), /* SPI */ }; diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 62b009b27eda..ec8dafc94694 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -747,13 +747,63 @@ static const struct pinctrl_desc intel_pinctrl_desc = { .owner = THIS_MODULE, }; +/** + * intel_gpio_to_pin() - Translate from GPIO offset to pin number + * @pctrl: Pinctrl structure + * @offset: GPIO offset from gpiolib + * @commmunity: Community is filled here if not %NULL + * @padgrp: Pad group is filled here if not %NULL + * + * When coming through gpiolib irqchip, the GPIO offset is not + * automatically translated to pinctrl pin number. This function can be + * used to find out the corresponding pinctrl pin. 
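
(A worked example with made-up numbers may help: the lookup walks every community's pad groups and, for the group whose GPIO window contains the offset, computes pin = pgrp->base + offset - pgrp->gpio_base. The values below are hypothetical and only illustrate the arithmetic.)

    /*
     * Hypothetical pad group: pinctrl pins start at base = 51, the gpiolib
     * window starts at gpio_base = 67 and spans size = 24 GPIOs.
     */
    static int example_gpio_to_pin(unsigned int offset)
    {
            const unsigned int base = 51, gpio_base = 67, size = 24;

            if (offset < gpio_base || offset >= gpio_base + size)
                    return -1;                      /* not in this group */

            return base + offset - gpio_base;       /* offset 70 -> pin 54 */
    }
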
+ */ +static int intel_gpio_to_pin(struct intel_pinctrl *pctrl, unsigned offset, + const struct intel_community **community, + const struct intel_padgroup **padgrp) +{ + int i; + + for (i = 0; i < pctrl->ncommunities; i++) { + const struct intel_community *comm = &pctrl->communities[i]; + int j; + + for (j = 0; j < comm->ngpps; j++) { + const struct intel_padgroup *pgrp = &comm->gpps[j]; + + if (pgrp->gpio_base < 0) + continue; + + if (offset >= pgrp->gpio_base && + offset < pgrp->gpio_base + pgrp->size) { + int pin; + + pin = pgrp->base + offset - pgrp->gpio_base; + if (community) + *community = comm; + if (padgrp) + *padgrp = pgrp; + + return pin; + } + } + } + + return -EINVAL; +} + static int intel_gpio_get(struct gpio_chip *chip, unsigned offset) { struct intel_pinctrl *pctrl = gpiochip_get_data(chip); void __iomem *reg; u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return -EINVAL; @@ -770,8 +820,13 @@ static void intel_gpio_set(struct gpio_chip *chip, unsigned offset, int value) unsigned long flags; void __iomem *reg; u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return; @@ -790,8 +845,13 @@ static int intel_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) struct intel_pinctrl *pctrl = gpiochip_get_data(chip); void __iomem *reg; u32 padcfg0; + int pin; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; + + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return -EINVAL; @@ -827,51 +887,6 @@ static const struct gpio_chip intel_gpio_chip = { .set_config = gpiochip_generic_config, }; -/** - * intel_gpio_to_pin() - Translate from GPIO offset to pin number - * @pctrl: Pinctrl structure - * @offset: GPIO offset from gpiolib - * @commmunity: Community is filled here if not %NULL - * @padgrp: Pad group is filled here if not %NULL - * - * When coming through gpiolib irqchip, the GPIO offset is not - * automatically translated to pinctrl pin number. This function can be - * used to find out the corresponding pinctrl pin. 
- */ -static int intel_gpio_to_pin(struct intel_pinctrl *pctrl, unsigned offset, - const struct intel_community **community, - const struct intel_padgroup **padgrp) -{ - int i; - - for (i = 0; i < pctrl->ncommunities; i++) { - const struct intel_community *comm = &pctrl->communities[i]; - int j; - - for (j = 0; j < comm->ngpps; j++) { - const struct intel_padgroup *pgrp = &comm->gpps[j]; - - if (pgrp->gpio_base < 0) - continue; - - if (offset >= pgrp->gpio_base && - offset < pgrp->gpio_base + pgrp->size) { - int pin; - - pin = pgrp->base + offset - pgrp->gpio_base; - if (community) - *community = comm; - if (padgrp) - *padgrp = pgrp; - - return pin; - } - } - } - - return -EINVAL; -} - static int intel_gpio_irq_reqres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7bafa703a992..84575baceebc 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1040,18 +1040,33 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; for (i = 0; i < count; i++) { - /* Retry eagain maps */ - if (map_ops[i].status == GNTST_eagain) - gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, - &map_ops[i].status, __func__); - - if (map_ops[i].status == GNTST_okay) { + switch (map_ops[i].status) { + case GNTST_okay: + { struct xen_page_foreign *foreign; SetPageForeign(pages[i]); foreign = xen_page_foreign(pages[i]); foreign->domid = map_ops[i].dom; foreign->gref = map_ops[i].ref; + break; + } + + case GNTST_no_device_space: + pr_warn_ratelimited("maptrack limit reached, can't map all guest pages\n"); + break; + + case GNTST_eagain: + /* Retry eagain maps */ + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, + map_ops + i, + &map_ops[i].status, __func__); + /* Test status in next loop iteration. 
*/ + i--; + break; + + default: + break; } } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 57864422a2c8..25c08c6c7f99 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -83,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; - unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_msecs(part, which) \ + div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) + #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 8bdbb5f29494..74b0aa9c7499 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -357,6 +357,8 @@ #define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) #define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + #define GITS_BASER_NR_REGS 8 #define GITS_BASER_VALID (1ULL << 63) @@ -388,6 +390,9 @@ #define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_PHYS_52_to_48(phys) \ (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 8a125701ef7b..50bed4f89c1a 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -21,7 +21,7 @@ /* * Regulator configuration */ -/* DA9063 regulator IDs */ +/* DA9063 and DA9063L regulator IDs */ enum { /* BUCKs */ DA9063_ID_BCORE1, @@ -37,18 +37,20 @@ enum { DA9063_ID_BMEM_BIO_MERGED, /* When two BUCKs are merged, they cannot be reused separately */ - /* LDOs */ + /* LDOs on both DA9063 and DA9063L */ + DA9063_ID_LDO3, + DA9063_ID_LDO7, + DA9063_ID_LDO8, + DA9063_ID_LDO9, + DA9063_ID_LDO11, + + /* DA9063-only LDOs */ DA9063_ID_LDO1, DA9063_ID_LDO2, - DA9063_ID_LDO3, DA9063_ID_LDO4, DA9063_ID_LDO5, DA9063_ID_LDO6, - DA9063_ID_LDO7, - DA9063_ID_LDO8, - DA9063_ID_LDO9, DA9063_ID_LDO10, - DA9063_ID_LDO11, }; /* Regulators platform data */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7f2ff3a76995..2b7a652c9fa4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -420,13 +420,19 @@ struct kvm_run { struct kvm_coalesced_mmio_zone { __u64 addr; __u32 size; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; }; struct kvm_coalesced_mmio { __u64 phys_addr; __u32 len; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; __u8 data[8]; }; @@ -752,6 +758,15 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. 
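
(Also in this uapi header hunk, struct kvm_coalesced_mmio_zone gains a pio flag in a union with the old pad field. A sketch of how userspace might register a coalesced port-I/O zone, assuming the existing KVM_REGISTER_COALESCED_MMIO ioctl is reused once KVM_CAP_COALESCED_PIO reports support; vm_fd and the port number are illustrative.)

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static int register_coalesced_pio(int vm_fd)
    {
            struct kvm_coalesced_mmio_zone zone = {
                    .addr = 0x3f8,  /* illustrative guest I/O port */
                    .size = 8,
                    .pio  = 1,      /* new union member: this zone is port I/O */
            };

            if (ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone) < 0) {
                    perror("KVM_REGISTER_COALESCED_MMIO");
                    return -1;
            }
            return 0;
    }
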
+ */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) +/* * ioctls for /dev/kvm fds: */ #define KVM_GET_API_VERSION _IO(KVMIO, 0x00) @@ -955,6 +970,11 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_MSR_PLATFORM_INFO 159 #define KVM_CAP_PPC_NESTED_HV 160 +#define KVM_CAP_HYPERV_SEND_IPI 161 +#define KVM_CAP_COALESCED_PIO 162 +#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 +#define KVM_CAP_EXCEPTION_PAYLOAD 164 +#define KVM_CAP_ARM_VM_IPA_SIZE 165 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..b09875f580d5 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -288,6 +288,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -299,7 +300,10 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + union { + __u8 pad; + __u8 pending; + }; __u32 error_code; } exception; struct { @@ -322,7 +326,9 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u32 reserved[9]; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; /* for KVM_GET/SET_DEBUGREGS */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 07548de5c988..2875ce85b322 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size { #define KVM_PPC_PAGE_SIZES_REAL 0x00000001 #define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 struct kvm_ppc_smmu_info { __u64 flags; @@ -952,6 +953,11 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 +#define KVM_CAP_MSR_PLATFORM_INFO 159 +#define KVM_CAP_PPC_NESTED_HV 160 +#define KVM_CAP_HYPERV_SEND_IPI 161 +#define KVM_CAP_COALESCED_PIO 162 +#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 13a861135127..6eb9bacd1948 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2abd0f112627..bdb94939fd60 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -50,6 +50,7 @@ #include "libbpf.h" #include "bpf.h" #include "btf.h" +#include "str_error.h" #ifndef EM_BPF #define EM_BPF 247 @@ -469,7 +470,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) obj->efile.fd = open(obj->path, O_RDONLY); if (obj->efile.fd < 0) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(errno, errmsg, sizeof(errmsg)); + char *cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to open %s: %s\n", obj->path, cp); return -errno; @@ -810,8 +811,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) data->d_size, name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(-err, errmsg, - sizeof(errmsg)); + char *cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to alloc program %s (%s): %s", name, 
obj->path, cp); @@ -1140,7 +1140,7 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && create_attr.btf_key_type_id) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, errno); create_attr.btf_fd = 0; @@ -1155,7 +1155,7 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; err = *pfd; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to create map (name: '%s'): %s\n", map->name, cp); for (j = 0; j < i; j++) @@ -1339,7 +1339,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, } ret = -LIBBPF_ERRNO__LOAD; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("load bpf program failed: %s\n", cp); if (log_buf && log_buf[0] != '\0') { @@ -1654,7 +1654,7 @@ static int check_path(const char *path) dir = dirname(dname); if (statfs(dir, &st_fs)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to statfs %s: %s\n", dir, cp); err = -errno; } @@ -1690,7 +1690,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin program: %s\n", cp); return -errno; } @@ -1708,7 +1708,7 @@ static int make_dir(const char *path) err = -errno; if (err) { - cp = strerror_r(-err, errmsg, sizeof(errmsg)); + cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to mkdir %s: %s\n", path, cp); } return err; @@ -1770,7 +1770,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path) } if (bpf_obj_pin(map->fd, path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin map: %s\n", cp); return -errno; } diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c new file mode 100644 index 000000000000..b8798114a357 --- /dev/null +++ b/tools/lib/bpf/str_error.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: LGPL-2.1 +#undef _GNU_SOURCE +#include <string.h> +#include <stdio.h> +#include "str_error.h" + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. 
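+ *
+ * A typical call site, mirroring the libbpf conversions above (a sketch
+ * only; errmsg and STRERR_BUFSIZE belong to the caller, as in libbpf.c,
+ * and "path" stands for whatever name the caller reports):
+ *
+ *	char errmsg[STRERR_BUFSIZE];
+ *	char *cp = str_error(errno, errmsg, sizeof(errmsg));
+ *	pr_warning("failed to open %s: %s\n", path, cp);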
+ */ +char *str_error(int err, char *dst, int len) +{ + int ret = strerror_r(err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h new file mode 100644 index 000000000000..355b1db571d1 --- /dev/null +++ b/tools/lib/bpf/str_error.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: LGPL-2.1 +#ifndef BPF_STR_ERROR +#define BPF_STR_ERROR + +char *str_error(int err, char *dst, int len); +#endif // BPF_STR_ERROR diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 42261a9b280e..ac841bc5c35b 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -280,7 +280,7 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt mv $@+ $@ ifdef USE_ASCIIDOCTOR -$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.txt +$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b manpage -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 5c34752e1cff..6210ba41c29e 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,8 @@ -cr4_cpuid_sync_test -platform_info_test -set_sregs_test -sync_regs_test -vmx_tsc_adjust_test -state_test +/x86_64/cr4_cpuid_sync_test +/x86_64/evmcs_test +/x86_64/platform_info_test +/x86_64/set_sregs_test +/x86_64/sync_regs_test +/x86_64/vmx_tsc_adjust_test +/x86_64/state_test +/dirty_log_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ec32dad3c3f0..01a219229238 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -1,26 +1,30 @@ all: -top_srcdir = ../../../../ +top_srcdir = ../../../.. UNAME_M := $(shell uname -m) -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c -LIBKVM_x86_64 = lib/x86.c lib/vmx.c - -TEST_GEN_PROGS_x86_64 = platform_info_test -TEST_GEN_PROGS_x86_64 += set_sregs_test -TEST_GEN_PROGS_x86_64 += sync_regs_test -TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test -TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test -TEST_GEN_PROGS_x86_64 += state_test +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c +LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c +LIBKVM_aarch64 = lib/aarch64/processor.c + +TEST_GEN_PROGS_x86_64 = x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test +TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/state_test +TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_aarch64 += dirty_log_test + TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ -LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include -CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. +LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include +CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I.. 
LDFLAGS += -pthread # After inclusion, $(OUTPUT) is defined and @@ -29,7 +33,7 @@ include ../lib.mk STATIC_LIBS := $(OUTPUT)/libkvm.a LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM)) -EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) +EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.* x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ)))) $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c @@ -41,3 +45,12 @@ $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) all: $(STATIC_LIBS) $(TEST_GEN_PROGS): $(STATIC_LIBS) $(STATIC_LIBS):| khdr + +cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. +cscope: + $(RM) cscope.* + (find $(include_paths) -name '*.h' \ + -exec realpath --relative-base=$(PWD) {} \;; \ + find . -name '*.c' \ + -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files + cscope -b diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 0c2cdc105f96..d59820cc2d39 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -5,6 +5,8 @@ * Copyright (C) 2018, Red Hat, Inc. */ +#define _GNU_SOURCE /* for program_invocation_name */ + #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -15,76 +17,78 @@ #include "test_util.h" #include "kvm_util.h" +#include "processor.h" + +#define DEBUG printf -#define DEBUG printf +#define VCPU_ID 1 -#define VCPU_ID 1 /* The memory slot index to track dirty pages */ -#define TEST_MEM_SLOT_INDEX 1 -/* - * GPA offset of the testing memory slot. Must be bigger than the - * default vm mem slot, which is DEFAULT_GUEST_PHY_PAGES. - */ -#define TEST_MEM_OFFSET (1ULL << 30) /* 1G */ -/* Size of the testing memory slot */ -#define TEST_MEM_PAGES (1ULL << 18) /* 1G for 4K pages */ +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test memory offset, 1G */ +#define DEFAULT_GUEST_TEST_MEM 0x40000000 + /* How many pages to dirty for each guest loop */ -#define TEST_PAGES_PER_LOOP 1024 +#define TEST_PAGES_PER_LOOP 1024 + /* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */ -#define TEST_HOST_LOOP_N 32 +#define TEST_HOST_LOOP_N 32 + /* Interval for each host loop (ms) */ -#define TEST_HOST_LOOP_INTERVAL 10 +#define TEST_HOST_LOOP_INTERVAL 10 + +/* + * Guest/Host shared variables. Ensure addr_gva2hva() and/or + * sync_global_to/from_guest() are used when accessing from + * the host. READ/WRITE_ONCE() should also be used with anything + * that may change. + */ +static uint64_t host_page_size; +static uint64_t guest_page_size; +static uint64_t guest_num_pages; +static uint64_t random_array[TEST_PAGES_PER_LOOP]; +static uint64_t iteration; /* - * Guest variables. We use these variables to share data between host - * and guest. There are two copies of the variables, one in host memory - * (which is unused) and one in guest memory. When the host wants to - * access these variables, it needs to call addr_gva2hva() to access the - * guest copy. + * GPA offset of the testing memory slot. Must be bigger than + * DEFAULT_GUEST_PHY_PAGES. */ -uint64_t guest_random_array[TEST_PAGES_PER_LOOP]; -uint64_t guest_iteration; -uint64_t guest_page_size; +static uint64_t guest_test_mem = DEFAULT_GUEST_TEST_MEM; /* - * Writes to the first byte of a random page within the testing memory - * region continuously. + * Continuously write to the first 8 bytes of a random pages within + * the testing memory region. 
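+ *
+ * Worked example with illustrative numbers (not taken from the test
+ * itself): with guest_test_mem = 0x40000000, guest_page_size = 0x1000
+ * and a random value of 5, the write lands at 0x40005000 (then aligned
+ * down to host_page_size) and stores the current iteration count there.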
*/ -void guest_code(void) +static void guest_code(void) { - int i = 0; - uint64_t volatile *array = guest_random_array; - uint64_t volatile *guest_addr; + int i; while (true) { for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { - /* - * Write to the first 8 bytes of a random page - * on the testing memory region. - */ - guest_addr = (uint64_t *) - (TEST_MEM_OFFSET + - (array[i] % TEST_MEM_PAGES) * guest_page_size); - *guest_addr = guest_iteration; + uint64_t addr = guest_test_mem; + addr += (READ_ONCE(random_array[i]) % guest_num_pages) + * guest_page_size; + addr &= ~(host_page_size - 1); + *(uint64_t *)addr = READ_ONCE(iteration); } + /* Tell the host that we need more random numbers */ GUEST_SYNC(1); } } -/* - * Host variables. These variables should only be used by the host - * rather than the guest. - */ -bool host_quit; +/* Host variables */ +static bool host_quit; /* Points to the test VM memory region on which we track dirty logs */ -void *host_test_mem; +static void *host_test_mem; +static uint64_t host_num_pages; /* For statistics only */ -uint64_t host_dirty_count; -uint64_t host_clear_count; -uint64_t host_track_next_count; +static uint64_t host_dirty_count; +static uint64_t host_clear_count; +static uint64_t host_track_next_count; /* * We use this bitmap to track some pages that should have its dirty @@ -93,40 +97,34 @@ uint64_t host_track_next_count; * page bit is cleared in the latest bitmap, then the system must * report that write in the next get dirty log call. */ -unsigned long *host_bmap_track; +static unsigned long *host_bmap_track; -void generate_random_array(uint64_t *guest_array, uint64_t size) +static void generate_random_array(uint64_t *guest_array, uint64_t size) { uint64_t i; - for (i = 0; i < size; i++) { + for (i = 0; i < size; i++) guest_array[i] = random(); - } } -void *vcpu_worker(void *data) +static void *vcpu_worker(void *data) { int ret; - uint64_t loops, *guest_array, pages_count = 0; struct kvm_vm *vm = data; + uint64_t *guest_array; + uint64_t pages_count = 0; struct kvm_run *run; - struct guest_args args; + struct ucall uc; run = vcpu_state(vm, VCPU_ID); - /* Retrieve the guest random array pointer and cache it */ - guest_array = addr_gva2hva(vm, (vm_vaddr_t)guest_random_array); - - DEBUG("VCPU starts\n"); - + guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); generate_random_array(guest_array, TEST_PAGES_PER_LOOP); while (!READ_ONCE(host_quit)) { - /* Let the guest to dirty these random pages */ + /* Let the guest dirty the random pages */ ret = _vcpu_run(vm, VCPU_ID); - guest_args_read(vm, VCPU_ID, &args); - if (run->exit_reason == KVM_EXIT_IO && - args.port == GUEST_PORT_SYNC) { + if (get_ucall(vm, VCPU_ID, &uc) == UCALL_SYNC) { pages_count += TEST_PAGES_PER_LOOP; generate_random_array(guest_array, TEST_PAGES_PER_LOOP); } else { @@ -137,18 +135,20 @@ void *vcpu_worker(void *data) } } - DEBUG("VCPU exits, dirtied %"PRIu64" pages\n", pages_count); + DEBUG("Dirtied %"PRIu64" pages\n", pages_count); return NULL; } -void vm_dirty_log_verify(unsigned long *bmap, uint64_t iteration) +static void vm_dirty_log_verify(unsigned long *bmap) { uint64_t page; - uint64_t volatile *value_ptr; + uint64_t *value_ptr; + uint64_t step = host_page_size >= guest_page_size ? 
1 : + guest_page_size / host_page_size; - for (page = 0; page < TEST_MEM_PAGES; page++) { - value_ptr = host_test_mem + page * getpagesize(); + for (page = 0; page < host_num_pages; page += step) { + value_ptr = host_test_mem + page * host_page_size; /* If this is a special page that we were tracking... */ if (test_and_clear_bit(page, host_bmap_track)) { @@ -208,88 +208,117 @@ void vm_dirty_log_verify(unsigned long *bmap, uint64_t iteration) } } -void help(char *name) +static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, + uint64_t extra_mem_pages, void *guest_code) { - puts(""); - printf("usage: %s [-i iterations] [-I interval] [-h]\n", name); - puts(""); - printf(" -i: specify iteration counts (default: %"PRIu64")\n", - TEST_HOST_LOOP_N); - printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", - TEST_HOST_LOOP_INTERVAL); - puts(""); - exit(0); + struct kvm_vm *vm; + uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + vm_vcpu_add_default(vm, vcpuid, guest_code); + return vm; } -int main(int argc, char *argv[]) +static void run_test(enum vm_guest_mode mode, unsigned long iterations, + unsigned long interval, bool top_offset) { + unsigned int guest_pa_bits, guest_page_shift; pthread_t vcpu_thread; struct kvm_vm *vm; - uint64_t volatile *psize, *iteration; - unsigned long *bmap, iterations = TEST_HOST_LOOP_N, - interval = TEST_HOST_LOOP_INTERVAL; - int opt; - - while ((opt = getopt(argc, argv, "hi:I:")) != -1) { - switch (opt) { - case 'i': - iterations = strtol(optarg, NULL, 10); - break; - case 'I': - interval = strtol(optarg, NULL, 10); - break; - case 'h': - default: - help(argv[0]); - break; - } + uint64_t max_gfn; + unsigned long *bmap; + + switch (mode) { + case VM_MODE_P52V48_4K: + guest_pa_bits = 52; + guest_page_shift = 12; + break; + case VM_MODE_P52V48_64K: + guest_pa_bits = 52; + guest_page_shift = 16; + break; + case VM_MODE_P40V48_4K: + guest_pa_bits = 40; + guest_page_shift = 12; + break; + case VM_MODE_P40V48_64K: + guest_pa_bits = 40; + guest_page_shift = 16; + break; + default: + TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); } - TEST_ASSERT(iterations > 2, "Iteration must be bigger than zero\n"); - TEST_ASSERT(interval > 0, "Interval must be bigger than zero"); + DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", - iterations, interval); + max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; + guest_page_size = (1ul << guest_page_shift); + /* 1G of guest page sized pages */ + guest_num_pages = (1ul << (30 - guest_page_shift)); + host_page_size = getpagesize(); + host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + + !!((guest_num_pages * guest_page_size) % host_page_size); - srandom(time(0)); + if (top_offset) { + guest_test_mem = (max_gfn - guest_num_pages) * guest_page_size; + guest_test_mem &= ~(host_page_size - 1); + } - bmap = bitmap_alloc(TEST_MEM_PAGES); - host_bmap_track = bitmap_alloc(TEST_MEM_PAGES); + DEBUG("guest test mem offset: 0x%lx\n", guest_test_mem); - vm = vm_create_default(VCPU_ID, TEST_MEM_PAGES, guest_code); + bmap = bitmap_alloc(host_num_pages); + host_bmap_track = bitmap_alloc(host_num_pages); + + vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); /* Add an extra memory slot for testing dirty logging */ 
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - TEST_MEM_OFFSET, + guest_test_mem, TEST_MEM_SLOT_INDEX, - TEST_MEM_PAGES, + guest_num_pages, KVM_MEM_LOG_DIRTY_PAGES); - /* Cache the HVA pointer of the region */ - host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)TEST_MEM_OFFSET); /* Do 1:1 mapping for the dirty track memory slot */ - virt_map(vm, TEST_MEM_OFFSET, TEST_MEM_OFFSET, - TEST_MEM_PAGES * getpagesize(), 0); + virt_map(vm, guest_test_mem, guest_test_mem, + guest_num_pages * guest_page_size, 0); + + /* Cache the HVA pointer of the region */ + host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_mem); +#ifdef __x86_64__ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); +#endif +#ifdef __aarch64__ + ucall_init(vm, UCALL_MMIO, NULL); +#endif - /* Tell the guest about the page size on the system */ - psize = addr_gva2hva(vm, (vm_vaddr_t)&guest_page_size); - *psize = getpagesize(); + /* Export the shared variables to the guest */ + sync_global_to_guest(vm, host_page_size); + sync_global_to_guest(vm, guest_page_size); + sync_global_to_guest(vm, guest_test_mem); + sync_global_to_guest(vm, guest_num_pages); /* Start the iterations */ - iteration = addr_gva2hva(vm, (vm_vaddr_t)&guest_iteration); - *iteration = 1; + iteration = 1; + sync_global_to_guest(vm, iteration); + host_quit = false; + host_dirty_count = 0; + host_clear_count = 0; + host_track_next_count = 0; - /* Start dirtying pages */ pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); - while (*iteration < iterations) { + while (iteration < iterations) { /* Give the vcpu thread some time to dirty some pages */ usleep(interval * 1000); kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - vm_dirty_log_verify(bmap, *iteration); - (*iteration)++; + vm_dirty_log_verify(bmap); + iteration++; + sync_global_to_guest(vm, iteration); } /* Tell the vcpu thread to quit */ @@ -302,7 +331,118 @@ int main(int argc, char *argv[]) free(bmap); free(host_bmap_track); + ucall_uninit(vm); kvm_vm_free(vm); +} + +static struct vm_guest_modes { + enum vm_guest_mode mode; + bool supported; + bool enabled; +} vm_guest_modes[NUM_VM_MODES] = { +#if defined(__x86_64__) + { VM_MODE_P52V48_4K, 1, 1, }, + { VM_MODE_P52V48_64K, 0, 0, }, + { VM_MODE_P40V48_4K, 0, 0, }, + { VM_MODE_P40V48_64K, 0, 0, }, +#elif defined(__aarch64__) + { VM_MODE_P52V48_4K, 0, 0, }, + { VM_MODE_P52V48_64K, 0, 0, }, + { VM_MODE_P40V48_4K, 1, 1, }, + { VM_MODE_P40V48_64K, 1, 1, }, +#endif +}; + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-i iterations] [-I interval] " + "[-o offset] [-t] [-m mode]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", + TEST_HOST_LOOP_INTERVAL); + printf(" -o: guest test memory offset (default: 0x%lx)\n", + DEFAULT_GUEST_TEST_MEM); + printf(" -t: map guest test memory at the top of the allowed " + "physical address range\n"); + printf(" -m: specify the guest mode ID to test " + "(default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", + vm_guest_modes[i].mode, + vm_guest_mode_string(vm_guest_modes[i].mode), + vm_guest_modes[i].supported ? 
" (supported)" : ""); + } + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + unsigned long iterations = TEST_HOST_LOOP_N; + unsigned long interval = TEST_HOST_LOOP_INTERVAL; + bool mode_selected = false; + bool top_offset = false; + unsigned int mode; + int opt, i; + + while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'I': + interval = strtol(optarg, NULL, 10); + break; + case 'o': + guest_test_mem = strtoull(optarg, NULL, 0); + break; + case 't': + top_offset = true; + break; + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + vm_guest_modes[i].enabled = 0; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + vm_guest_modes[mode].enabled = 1; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(interval > 0, "Interval must be greater than zero"); + TEST_ASSERT(!top_offset || guest_test_mem == DEFAULT_GUEST_TEST_MEM, + "Cannot use both -o [offset] and -t at the same time"); + + DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", + iterations, interval); + + srandom(time(0)); + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!vm_guest_modes[i].enabled) + continue; + TEST_ASSERT(vm_guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + vm_guest_modes[i].mode, + vm_guest_mode_string(vm_guest_modes[i].mode)); + run_test(vm_guest_modes[i].mode, iterations, interval, top_offset); + } return 0; } diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h new file mode 100644 index 000000000000..9ef2ab1a0c08 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch64 processor specific defines + * + * Copyright (C) 2018, Red Hat, Inc. 
+ */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include "kvm_util.h" + + +#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) + +#define CPACR_EL1 3, 0, 1, 0, 2 +#define TCR_EL1 3, 0, 2, 0, 2 +#define MAIR_EL1 3, 0, 10, 2, 0 +#define TTBR0_EL1 3, 0, 2, 0, 0 +#define SCTLR_EL1 3, 0, 1, 0, 0 + +/* + * Default MAIR + * index attribute + * DEVICE_nGnRnE 0 0000:0000 + * DEVICE_nGnRE 1 0000:0100 + * DEVICE_GRE 2 0000:1100 + * NORMAL_NC 3 0100:0100 + * NORMAL 4 1111:1111 + * NORMAL_WT 5 1011:1011 + */ +#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \ + (0x04ul << (1 * 8)) | \ + (0x0cul << (2 * 8)) | \ + (0x44ul << (3 * 8)) | \ + (0xfful << (4 * 8)) | \ + (0xbbul << (5 * 8))) + +static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) +{ + struct kvm_one_reg reg; + reg.id = id; + reg.addr = (uint64_t)addr; + vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); +} + +static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg; + reg.id = id; + reg.addr = (uint64_t)&val; + vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); +} + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h new file mode 100644 index 000000000000..4059014d93ea --- /dev/null +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -0,0 +1,1098 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/vmx.h + * + * Copyright (C) 2018, Red Hat, Inc. + * + */ + +#ifndef SELFTEST_KVM_EVMCS_H +#define SELFTEST_KVM_EVMCS_H + +#include <stdint.h> +#include "vmx.h" + +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +extern bool enable_evmcs; + +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved; + __u64 vtl_control[2]; + __u64 nested_enlightenments_control[2]; + __u32 enlighten_vmentry; + __u64 current_nested_vmcs; +}; + +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64
vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 hv_padding_32; + u32 hv_synthetic_controls; + struct { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 reserved:30; + } hv_enlightenments_control; + u32 hv_vp_id; + + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 padding64_5[7]; + u64 xss_exit_bitmap; + u64 padding64_6[7]; +}; + +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + +static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +{ + u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); + + current_vp_assist = vp_assist; + + enable_evmcs = true; + + return 0; +} + +static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) +{ + current_vp_assist->current_nested_vmcs = vmcs_pa; + current_vp_assist->enlighten_vmentry = 1; + + current_evmcs = vmcs; + + return 0; +} + +static inline int evmcs_vmptrst(uint64_t *value) +{ + *value = current_vp_assist->current_nested_vmcs & + ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + return 0; +} + +static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) +{ + switch (encoding) { + case GUEST_RIP: + *value = current_evmcs->guest_rip; + break; + case GUEST_RSP: + *value = current_evmcs->guest_rsp; + break; + case GUEST_RFLAGS: + *value = current_evmcs->guest_rflags; + break; + case HOST_IA32_PAT: + *value = 
current_evmcs->host_ia32_pat; + break; + case HOST_IA32_EFER: + *value = current_evmcs->host_ia32_efer; + break; + case HOST_CR0: + *value = current_evmcs->host_cr0; + break; + case HOST_CR3: + *value = current_evmcs->host_cr3; + break; + case HOST_CR4: + *value = current_evmcs->host_cr4; + break; + case HOST_IA32_SYSENTER_ESP: + *value = current_evmcs->host_ia32_sysenter_esp; + break; + case HOST_IA32_SYSENTER_EIP: + *value = current_evmcs->host_ia32_sysenter_eip; + break; + case HOST_RIP: + *value = current_evmcs->host_rip; + break; + case IO_BITMAP_A: + *value = current_evmcs->io_bitmap_a; + break; + case IO_BITMAP_B: + *value = current_evmcs->io_bitmap_b; + break; + case MSR_BITMAP: + *value = current_evmcs->msr_bitmap; + break; + case GUEST_ES_BASE: + *value = current_evmcs->guest_es_base; + break; + case GUEST_CS_BASE: + *value = current_evmcs->guest_cs_base; + break; + case GUEST_SS_BASE: + *value = current_evmcs->guest_ss_base; + break; + case GUEST_DS_BASE: + *value = current_evmcs->guest_ds_base; + break; + case GUEST_FS_BASE: + *value = current_evmcs->guest_fs_base; + break; + case GUEST_GS_BASE: + *value = current_evmcs->guest_gs_base; + break; + case GUEST_LDTR_BASE: + *value = current_evmcs->guest_ldtr_base; + break; + case GUEST_TR_BASE: + *value = current_evmcs->guest_tr_base; + break; + case GUEST_GDTR_BASE: + *value = current_evmcs->guest_gdtr_base; + break; + case GUEST_IDTR_BASE: + *value = current_evmcs->guest_idtr_base; + break; + case TSC_OFFSET: + *value = current_evmcs->tsc_offset; + break; + case VIRTUAL_APIC_PAGE_ADDR: + *value = current_evmcs->virtual_apic_page_addr; + break; + case VMCS_LINK_POINTER: + *value = current_evmcs->vmcs_link_pointer; + break; + case GUEST_IA32_DEBUGCTL: + *value = current_evmcs->guest_ia32_debugctl; + break; + case GUEST_IA32_PAT: + *value = current_evmcs->guest_ia32_pat; + break; + case GUEST_IA32_EFER: + *value = current_evmcs->guest_ia32_efer; + break; + case GUEST_PDPTR0: + *value = current_evmcs->guest_pdptr0; + break; + case GUEST_PDPTR1: + *value = current_evmcs->guest_pdptr1; + break; + case GUEST_PDPTR2: + *value = current_evmcs->guest_pdptr2; + break; + case GUEST_PDPTR3: + *value = current_evmcs->guest_pdptr3; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + *value = current_evmcs->guest_pending_dbg_exceptions; + break; + case GUEST_SYSENTER_ESP: + *value = current_evmcs->guest_sysenter_esp; + break; + case GUEST_SYSENTER_EIP: + *value = current_evmcs->guest_sysenter_eip; + break; + case CR0_GUEST_HOST_MASK: + *value = current_evmcs->cr0_guest_host_mask; + break; + case CR4_GUEST_HOST_MASK: + *value = current_evmcs->cr4_guest_host_mask; + break; + case CR0_READ_SHADOW: + *value = current_evmcs->cr0_read_shadow; + break; + case CR4_READ_SHADOW: + *value = current_evmcs->cr4_read_shadow; + break; + case GUEST_CR0: + *value = current_evmcs->guest_cr0; + break; + case GUEST_CR3: + *value = current_evmcs->guest_cr3; + break; + case GUEST_CR4: + *value = current_evmcs->guest_cr4; + break; + case GUEST_DR7: + *value = current_evmcs->guest_dr7; + break; + case HOST_FS_BASE: + *value = current_evmcs->host_fs_base; + break; + case HOST_GS_BASE: + *value = current_evmcs->host_gs_base; + break; + case HOST_TR_BASE: + *value = current_evmcs->host_tr_base; + break; + case HOST_GDTR_BASE: + *value = current_evmcs->host_gdtr_base; + break; + case HOST_IDTR_BASE: + *value = current_evmcs->host_idtr_base; + break; + case HOST_RSP: + *value = current_evmcs->host_rsp; + break; + case EPT_POINTER: + *value = current_evmcs->ept_pointer; + break; 
+ case GUEST_BNDCFGS: + *value = current_evmcs->guest_bndcfgs; + break; + case XSS_EXIT_BITMAP: + *value = current_evmcs->xss_exit_bitmap; + break; + case GUEST_PHYSICAL_ADDRESS: + *value = current_evmcs->guest_physical_address; + break; + case EXIT_QUALIFICATION: + *value = current_evmcs->exit_qualification; + break; + case GUEST_LINEAR_ADDRESS: + *value = current_evmcs->guest_linear_address; + break; + case VM_EXIT_MSR_STORE_ADDR: + *value = current_evmcs->vm_exit_msr_store_addr; + break; + case VM_EXIT_MSR_LOAD_ADDR: + *value = current_evmcs->vm_exit_msr_load_addr; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + *value = current_evmcs->vm_entry_msr_load_addr; + break; + case CR3_TARGET_VALUE0: + *value = current_evmcs->cr3_target_value0; + break; + case CR3_TARGET_VALUE1: + *value = current_evmcs->cr3_target_value1; + break; + case CR3_TARGET_VALUE2: + *value = current_evmcs->cr3_target_value2; + break; + case CR3_TARGET_VALUE3: + *value = current_evmcs->cr3_target_value3; + break; + case TPR_THRESHOLD: + *value = current_evmcs->tpr_threshold; + break; + case GUEST_INTERRUPTIBILITY_INFO: + *value = current_evmcs->guest_interruptibility_info; + break; + case CPU_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->cpu_based_vm_exec_control; + break; + case EXCEPTION_BITMAP: + *value = current_evmcs->exception_bitmap; + break; + case VM_ENTRY_CONTROLS: + *value = current_evmcs->vm_entry_controls; + break; + case VM_ENTRY_INTR_INFO_FIELD: + *value = current_evmcs->vm_entry_intr_info_field; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + *value = current_evmcs->vm_entry_exception_error_code; + break; + case VM_ENTRY_INSTRUCTION_LEN: + *value = current_evmcs->vm_entry_instruction_len; + break; + case HOST_IA32_SYSENTER_CS: + *value = current_evmcs->host_ia32_sysenter_cs; + break; + case PIN_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->pin_based_vm_exec_control; + break; + case VM_EXIT_CONTROLS: + *value = current_evmcs->vm_exit_controls; + break; + case SECONDARY_VM_EXEC_CONTROL: + *value = current_evmcs->secondary_vm_exec_control; + break; + case GUEST_ES_LIMIT: + *value = current_evmcs->guest_es_limit; + break; + case GUEST_CS_LIMIT: + *value = current_evmcs->guest_cs_limit; + break; + case GUEST_SS_LIMIT: + *value = current_evmcs->guest_ss_limit; + break; + case GUEST_DS_LIMIT: + *value = current_evmcs->guest_ds_limit; + break; + case GUEST_FS_LIMIT: + *value = current_evmcs->guest_fs_limit; + break; + case GUEST_GS_LIMIT: + *value = current_evmcs->guest_gs_limit; + break; + case GUEST_LDTR_LIMIT: + *value = current_evmcs->guest_ldtr_limit; + break; + case GUEST_TR_LIMIT: + *value = current_evmcs->guest_tr_limit; + break; + case GUEST_GDTR_LIMIT: + *value = current_evmcs->guest_gdtr_limit; + break; + case GUEST_IDTR_LIMIT: + *value = current_evmcs->guest_idtr_limit; + break; + case GUEST_ES_AR_BYTES: + *value = current_evmcs->guest_es_ar_bytes; + break; + case GUEST_CS_AR_BYTES: + *value = current_evmcs->guest_cs_ar_bytes; + break; + case GUEST_SS_AR_BYTES: + *value = current_evmcs->guest_ss_ar_bytes; + break; + case GUEST_DS_AR_BYTES: + *value = current_evmcs->guest_ds_ar_bytes; + break; + case GUEST_FS_AR_BYTES: + *value = current_evmcs->guest_fs_ar_bytes; + break; + case GUEST_GS_AR_BYTES: + *value = current_evmcs->guest_gs_ar_bytes; + break; + case GUEST_LDTR_AR_BYTES: + *value = current_evmcs->guest_ldtr_ar_bytes; + break; + case GUEST_TR_AR_BYTES: + *value = current_evmcs->guest_tr_ar_bytes; + break; + case GUEST_ACTIVITY_STATE: + *value = current_evmcs->guest_activity_state; + break; + 
case GUEST_SYSENTER_CS: + *value = current_evmcs->guest_sysenter_cs; + break; + case VM_INSTRUCTION_ERROR: + *value = current_evmcs->vm_instruction_error; + break; + case VM_EXIT_REASON: + *value = current_evmcs->vm_exit_reason; + break; + case VM_EXIT_INTR_INFO: + *value = current_evmcs->vm_exit_intr_info; + break; + case VM_EXIT_INTR_ERROR_CODE: + *value = current_evmcs->vm_exit_intr_error_code; + break; + case IDT_VECTORING_INFO_FIELD: + *value = current_evmcs->idt_vectoring_info_field; + break; + case IDT_VECTORING_ERROR_CODE: + *value = current_evmcs->idt_vectoring_error_code; + break; + case VM_EXIT_INSTRUCTION_LEN: + *value = current_evmcs->vm_exit_instruction_len; + break; + case VMX_INSTRUCTION_INFO: + *value = current_evmcs->vmx_instruction_info; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + *value = current_evmcs->page_fault_error_code_mask; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + *value = current_evmcs->page_fault_error_code_match; + break; + case CR3_TARGET_COUNT: + *value = current_evmcs->cr3_target_count; + break; + case VM_EXIT_MSR_STORE_COUNT: + *value = current_evmcs->vm_exit_msr_store_count; + break; + case VM_EXIT_MSR_LOAD_COUNT: + *value = current_evmcs->vm_exit_msr_load_count; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + *value = current_evmcs->vm_entry_msr_load_count; + break; + case HOST_ES_SELECTOR: + *value = current_evmcs->host_es_selector; + break; + case HOST_CS_SELECTOR: + *value = current_evmcs->host_cs_selector; + break; + case HOST_SS_SELECTOR: + *value = current_evmcs->host_ss_selector; + break; + case HOST_DS_SELECTOR: + *value = current_evmcs->host_ds_selector; + break; + case HOST_FS_SELECTOR: + *value = current_evmcs->host_fs_selector; + break; + case HOST_GS_SELECTOR: + *value = current_evmcs->host_gs_selector; + break; + case HOST_TR_SELECTOR: + *value = current_evmcs->host_tr_selector; + break; + case GUEST_ES_SELECTOR: + *value = current_evmcs->guest_es_selector; + break; + case GUEST_CS_SELECTOR: + *value = current_evmcs->guest_cs_selector; + break; + case GUEST_SS_SELECTOR: + *value = current_evmcs->guest_ss_selector; + break; + case GUEST_DS_SELECTOR: + *value = current_evmcs->guest_ds_selector; + break; + case GUEST_FS_SELECTOR: + *value = current_evmcs->guest_fs_selector; + break; + case GUEST_GS_SELECTOR: + *value = current_evmcs->guest_gs_selector; + break; + case GUEST_LDTR_SELECTOR: + *value = current_evmcs->guest_ldtr_selector; + break; + case GUEST_TR_SELECTOR: + *value = current_evmcs->guest_tr_selector; + break; + case VIRTUAL_PROCESSOR_ID: + *value = current_evmcs->virtual_processor_id; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) +{ + switch (encoding) { + case GUEST_RIP: + current_evmcs->guest_rip = value; + break; + case GUEST_RSP: + current_evmcs->guest_rsp = value; + break; + case GUEST_RFLAGS: + current_evmcs->guest_rflags = value; + break; + case HOST_IA32_PAT: + current_evmcs->host_ia32_pat = value; + break; + case HOST_IA32_EFER: + current_evmcs->host_ia32_efer = value; + break; + case HOST_CR0: + current_evmcs->host_cr0 = value; + break; + case HOST_CR3: + current_evmcs->host_cr3 = value; + break; + case HOST_CR4: + current_evmcs->host_cr4 = value; + break; + case HOST_IA32_SYSENTER_ESP: + current_evmcs->host_ia32_sysenter_esp = value; + break; + case HOST_IA32_SYSENTER_EIP: + current_evmcs->host_ia32_sysenter_eip = value; + break; + case HOST_RIP: + current_evmcs->host_rip = value; + break; + case IO_BITMAP_A: + current_evmcs->io_bitmap_a = 
value; + break; + case IO_BITMAP_B: + current_evmcs->io_bitmap_b = value; + break; + case MSR_BITMAP: + current_evmcs->msr_bitmap = value; + break; + case GUEST_ES_BASE: + current_evmcs->guest_es_base = value; + break; + case GUEST_CS_BASE: + current_evmcs->guest_cs_base = value; + break; + case GUEST_SS_BASE: + current_evmcs->guest_ss_base = value; + break; + case GUEST_DS_BASE: + current_evmcs->guest_ds_base = value; + break; + case GUEST_FS_BASE: + current_evmcs->guest_fs_base = value; + break; + case GUEST_GS_BASE: + current_evmcs->guest_gs_base = value; + break; + case GUEST_LDTR_BASE: + current_evmcs->guest_ldtr_base = value; + break; + case GUEST_TR_BASE: + current_evmcs->guest_tr_base = value; + break; + case GUEST_GDTR_BASE: + current_evmcs->guest_gdtr_base = value; + break; + case GUEST_IDTR_BASE: + current_evmcs->guest_idtr_base = value; + break; + case TSC_OFFSET: + current_evmcs->tsc_offset = value; + break; + case VIRTUAL_APIC_PAGE_ADDR: + current_evmcs->virtual_apic_page_addr = value; + break; + case VMCS_LINK_POINTER: + current_evmcs->vmcs_link_pointer = value; + break; + case GUEST_IA32_DEBUGCTL: + current_evmcs->guest_ia32_debugctl = value; + break; + case GUEST_IA32_PAT: + current_evmcs->guest_ia32_pat = value; + break; + case GUEST_IA32_EFER: + current_evmcs->guest_ia32_efer = value; + break; + case GUEST_PDPTR0: + current_evmcs->guest_pdptr0 = value; + break; + case GUEST_PDPTR1: + current_evmcs->guest_pdptr1 = value; + break; + case GUEST_PDPTR2: + current_evmcs->guest_pdptr2 = value; + break; + case GUEST_PDPTR3: + current_evmcs->guest_pdptr3 = value; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + current_evmcs->guest_pending_dbg_exceptions = value; + break; + case GUEST_SYSENTER_ESP: + current_evmcs->guest_sysenter_esp = value; + break; + case GUEST_SYSENTER_EIP: + current_evmcs->guest_sysenter_eip = value; + break; + case CR0_GUEST_HOST_MASK: + current_evmcs->cr0_guest_host_mask = value; + break; + case CR4_GUEST_HOST_MASK: + current_evmcs->cr4_guest_host_mask = value; + break; + case CR0_READ_SHADOW: + current_evmcs->cr0_read_shadow = value; + break; + case CR4_READ_SHADOW: + current_evmcs->cr4_read_shadow = value; + break; + case GUEST_CR0: + current_evmcs->guest_cr0 = value; + break; + case GUEST_CR3: + current_evmcs->guest_cr3 = value; + break; + case GUEST_CR4: + current_evmcs->guest_cr4 = value; + break; + case GUEST_DR7: + current_evmcs->guest_dr7 = value; + break; + case HOST_FS_BASE: + current_evmcs->host_fs_base = value; + break; + case HOST_GS_BASE: + current_evmcs->host_gs_base = value; + break; + case HOST_TR_BASE: + current_evmcs->host_tr_base = value; + break; + case HOST_GDTR_BASE: + current_evmcs->host_gdtr_base = value; + break; + case HOST_IDTR_BASE: + current_evmcs->host_idtr_base = value; + break; + case HOST_RSP: + current_evmcs->host_rsp = value; + break; + case EPT_POINTER: + current_evmcs->ept_pointer = value; + break; + case GUEST_BNDCFGS: + current_evmcs->guest_bndcfgs = value; + break; + case XSS_EXIT_BITMAP: + current_evmcs->xss_exit_bitmap = value; + break; + case GUEST_PHYSICAL_ADDRESS: + current_evmcs->guest_physical_address = value; + break; + case EXIT_QUALIFICATION: + current_evmcs->exit_qualification = value; + break; + case GUEST_LINEAR_ADDRESS: + current_evmcs->guest_linear_address = value; + break; + case VM_EXIT_MSR_STORE_ADDR: + current_evmcs->vm_exit_msr_store_addr = value; + break; + case VM_EXIT_MSR_LOAD_ADDR: + current_evmcs->vm_exit_msr_load_addr = value; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + 
current_evmcs->vm_entry_msr_load_addr = value; + break; + case CR3_TARGET_VALUE0: + current_evmcs->cr3_target_value0 = value; + break; + case CR3_TARGET_VALUE1: + current_evmcs->cr3_target_value1 = value; + break; + case CR3_TARGET_VALUE2: + current_evmcs->cr3_target_value2 = value; + break; + case CR3_TARGET_VALUE3: + current_evmcs->cr3_target_value3 = value; + break; + case TPR_THRESHOLD: + current_evmcs->tpr_threshold = value; + break; + case GUEST_INTERRUPTIBILITY_INFO: + current_evmcs->guest_interruptibility_info = value; + break; + case CPU_BASED_VM_EXEC_CONTROL: + current_evmcs->cpu_based_vm_exec_control = value; + break; + case EXCEPTION_BITMAP: + current_evmcs->exception_bitmap = value; + break; + case VM_ENTRY_CONTROLS: + current_evmcs->vm_entry_controls = value; + break; + case VM_ENTRY_INTR_INFO_FIELD: + current_evmcs->vm_entry_intr_info_field = value; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + current_evmcs->vm_entry_exception_error_code = value; + break; + case VM_ENTRY_INSTRUCTION_LEN: + current_evmcs->vm_entry_instruction_len = value; + break; + case HOST_IA32_SYSENTER_CS: + current_evmcs->host_ia32_sysenter_cs = value; + break; + case PIN_BASED_VM_EXEC_CONTROL: + current_evmcs->pin_based_vm_exec_control = value; + break; + case VM_EXIT_CONTROLS: + current_evmcs->vm_exit_controls = value; + break; + case SECONDARY_VM_EXEC_CONTROL: + current_evmcs->secondary_vm_exec_control = value; + break; + case GUEST_ES_LIMIT: + current_evmcs->guest_es_limit = value; + break; + case GUEST_CS_LIMIT: + current_evmcs->guest_cs_limit = value; + break; + case GUEST_SS_LIMIT: + current_evmcs->guest_ss_limit = value; + break; + case GUEST_DS_LIMIT: + current_evmcs->guest_ds_limit = value; + break; + case GUEST_FS_LIMIT: + current_evmcs->guest_fs_limit = value; + break; + case GUEST_GS_LIMIT: + current_evmcs->guest_gs_limit = value; + break; + case GUEST_LDTR_LIMIT: + current_evmcs->guest_ldtr_limit = value; + break; + case GUEST_TR_LIMIT: + current_evmcs->guest_tr_limit = value; + break; + case GUEST_GDTR_LIMIT: + current_evmcs->guest_gdtr_limit = value; + break; + case GUEST_IDTR_LIMIT: + current_evmcs->guest_idtr_limit = value; + break; + case GUEST_ES_AR_BYTES: + current_evmcs->guest_es_ar_bytes = value; + break; + case GUEST_CS_AR_BYTES: + current_evmcs->guest_cs_ar_bytes = value; + break; + case GUEST_SS_AR_BYTES: + current_evmcs->guest_ss_ar_bytes = value; + break; + case GUEST_DS_AR_BYTES: + current_evmcs->guest_ds_ar_bytes = value; + break; + case GUEST_FS_AR_BYTES: + current_evmcs->guest_fs_ar_bytes = value; + break; + case GUEST_GS_AR_BYTES: + current_evmcs->guest_gs_ar_bytes = value; + break; + case GUEST_LDTR_AR_BYTES: + current_evmcs->guest_ldtr_ar_bytes = value; + break; + case GUEST_TR_AR_BYTES: + current_evmcs->guest_tr_ar_bytes = value; + break; + case GUEST_ACTIVITY_STATE: + current_evmcs->guest_activity_state = value; + break; + case GUEST_SYSENTER_CS: + current_evmcs->guest_sysenter_cs = value; + break; + case VM_INSTRUCTION_ERROR: + current_evmcs->vm_instruction_error = value; + break; + case VM_EXIT_REASON: + current_evmcs->vm_exit_reason = value; + break; + case VM_EXIT_INTR_INFO: + current_evmcs->vm_exit_intr_info = value; + break; + case VM_EXIT_INTR_ERROR_CODE: + current_evmcs->vm_exit_intr_error_code = value; + break; + case IDT_VECTORING_INFO_FIELD: + current_evmcs->idt_vectoring_info_field = value; + break; + case IDT_VECTORING_ERROR_CODE: + current_evmcs->idt_vectoring_error_code = value; + break; + case VM_EXIT_INSTRUCTION_LEN: + 
current_evmcs->vm_exit_instruction_len = value; + break; + case VMX_INSTRUCTION_INFO: + current_evmcs->vmx_instruction_info = value; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + current_evmcs->page_fault_error_code_mask = value; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + current_evmcs->page_fault_error_code_match = value; + break; + case CR3_TARGET_COUNT: + current_evmcs->cr3_target_count = value; + break; + case VM_EXIT_MSR_STORE_COUNT: + current_evmcs->vm_exit_msr_store_count = value; + break; + case VM_EXIT_MSR_LOAD_COUNT: + current_evmcs->vm_exit_msr_load_count = value; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + current_evmcs->vm_entry_msr_load_count = value; + break; + case HOST_ES_SELECTOR: + current_evmcs->host_es_selector = value; + break; + case HOST_CS_SELECTOR: + current_evmcs->host_cs_selector = value; + break; + case HOST_SS_SELECTOR: + current_evmcs->host_ss_selector = value; + break; + case HOST_DS_SELECTOR: + current_evmcs->host_ds_selector = value; + break; + case HOST_FS_SELECTOR: + current_evmcs->host_fs_selector = value; + break; + case HOST_GS_SELECTOR: + current_evmcs->host_gs_selector = value; + break; + case HOST_TR_SELECTOR: + current_evmcs->host_tr_selector = value; + break; + case GUEST_ES_SELECTOR: + current_evmcs->guest_es_selector = value; + break; + case GUEST_CS_SELECTOR: + current_evmcs->guest_cs_selector = value; + break; + case GUEST_SS_SELECTOR: + current_evmcs->guest_ss_selector = value; + break; + case GUEST_DS_SELECTOR: + current_evmcs->guest_ds_selector = value; + break; + case GUEST_FS_SELECTOR: + current_evmcs->guest_fs_selector = value; + break; + case GUEST_GS_SELECTOR: + current_evmcs->guest_gs_selector = value; + break; + case GUEST_LDTR_SELECTOR: + current_evmcs->guest_ldtr_selector = value; + break; + case GUEST_TR_SELECTOR: + current_evmcs->guest_tr_selector = value; + break; + case VIRTUAL_PROCESSOR_ID: + current_evmcs->virtual_processor_id = value; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmlaunch(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)&current_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)&current_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. GPRs) is established by this vmresume.
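+ *
+ * The caller is expected to have established the guest context already,
+ * either by an earlier vmlaunch or via explicit evmcs_vmwrite() calls.
+ * A sketch of such a caller (guest_stack_top and l2_guest_code are
+ * placeholder names, not defined in this header):
+ *
+ *	evmcs_vmwrite(GUEST_RSP, guest_stack_top);
+ *	evmcs_vmwrite(GUEST_RIP, (uint64_t)l2_guest_code);
+ *	GUEST_ASSERT(!evmcs_vmresume());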
+ */ +static inline int evmcs_vmresume(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)&current_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)&current_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 3acf9a91704c..a4e59e3b4826 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -7,7 +7,7 @@ * */ #ifndef SELFTEST_KVM_UTIL_H -#define SELFTEST_KVM_UTIL_H 1 +#define SELFTEST_KVM_UTIL_H #include "test_util.h" @@ -17,12 +17,6 @@ #include "sparsebit.h" -/* - * Memslots can't cover the gfn starting at this gpa otherwise vCPUs can't be - * created. Only applies to VMs using EPT. - */ -#define KVM_DEFAULT_IDENTITY_MAP_ADDRESS 0xfffbc000ul - /* Callers of kvm_util only have an incomplete/opaque description of the * structure kvm_util is using to maintain the state of a VM. @@ -33,16 +27,23 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ /* Minimum allocated guest virtual and physical addresses */ -#define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_UTIL_MIN_VADDR 0x2000 #define DEFAULT_GUEST_PHY_PAGES 512 #define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 -#define DEFAULT_STACK_PGS 5 +#define DEFAULT_STACK_PGS 5 enum vm_guest_mode { - VM_MODE_FLAT48PG, + VM_MODE_P52V48_4K, + VM_MODE_P52V48_64K, + VM_MODE_P40V48_4K, + VM_MODE_P40V48_64K, + NUM_VM_MODES, }; +#define vm_guest_mode_string(m) vm_guest_mode_string[m] +extern const char * const vm_guest_mode_string[]; + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, @@ -58,15 +59,15 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); -int kvm_memcmp_hva_gva(void *hva, - struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, + size_t len); void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot); + uint32_t data_memslot, uint32_t pgd_memslot); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -void vcpu_dump(FILE *stream, struct kvm_vm *vm, - uint32_t vcpuid, uint8_t indent); +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, + uint8_t indent); void vm_create_irqchip(struct kvm_vm *vm); @@ -75,13 +76,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); -void vcpu_ioctl(struct kvm_vm *vm, - uint32_t vcpuid, unsigned long ioctl, void *arg); +void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, + void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int
gdt_memslot); +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, + int gdt_memslot); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot); + uint32_t data_memslot, uint32_t pgd_memslot); void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, size_t size, uint32_t pgd_memslot); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); @@ -93,56 +95,35 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state); -void vcpu_regs_get(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_regs *regs); -void vcpu_regs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_regs *regs); + struct kvm_mp_state *mp_state); +void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); +void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); -void vcpu_sregs_get(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs); -void vcpu_sregs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs); -int _vcpu_sregs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs); +void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); +void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); +int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); + struct kvm_vcpu_events *events); void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value); + struct kvm_vcpu_events *events); const char *exit_reason_str(unsigned int exit_reason); void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot); -vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, - vm_paddr_t paddr_min, uint32_t memslot); - -struct kvm_cpuid2 *kvm_get_supported_cpuid(void); -void vcpu_set_cpuid( - struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid); - -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); - -static inline struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_entry(uint32_t function) -{ - return kvm_get_supported_cpuid_index(function, 0); -} + uint32_t pgd_memslot); +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot); +vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot); struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size, void *guest_code); void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); -typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr, - vm_paddr_t vmxon_paddr, - vm_vaddr_t vmcs_vaddr, - vm_paddr_t vmcs_paddr); - struct kvm_userspace_memory_region * kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end); @@ -152,43 +133,49 @@ allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region); int vm_create_device(struct 
kvm_vm *vm, struct kvm_create_device *cd); -#define GUEST_PORT_SYNC 0x1000 -#define GUEST_PORT_ABORT 0x1001 -#define GUEST_PORT_DONE 0x1002 - -static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) -{ - __asm__ __volatile__("in %[port], %%al" - : - : [port]"d"(port), "D"(arg0), "S"(arg1) - : "rax"); -} - -/* - * Allows to pass three arguments to the host: port is 16bit wide, - * arg0 & arg1 are 64bit wide - */ -#define GUEST_SYNC_ARGS(_port, _arg0, _arg1) \ - __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) - -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - GUEST_SYNC_ARGS(GUEST_PORT_ABORT, \ - "Failed guest assert: " \ - #_condition, __LINE__); \ - } while (0) - -#define GUEST_SYNC(stage) GUEST_SYNC_ARGS(GUEST_PORT_SYNC, "hello", stage) +#define sync_global_to_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(_p, &(g), sizeof(g)); \ +}) + +#define sync_global_from_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(&(g), _p, sizeof(g)); \ +}) + +/* ucall implementation types */ +typedef enum { + UCALL_PIO, + UCALL_MMIO, +} ucall_type_t; + +/* Common ucalls */ +enum { + UCALL_NONE, + UCALL_SYNC, + UCALL_ABORT, + UCALL_DONE, +}; -#define GUEST_DONE() GUEST_SYNC_ARGS(GUEST_PORT_DONE, 0, 0) +#define UCALL_MAX_ARGS 6 -struct guest_args { - uint64_t arg0; - uint64_t arg1; - uint16_t port; -} __attribute__ ((packed)); +struct ucall { + uint64_t cmd; + uint64_t args[UCALL_MAX_ARGS]; +}; -void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id, - struct guest_args *args); +void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg); +void ucall_uninit(struct kvm_vm *vm); +void ucall(uint64_t cmd, int nargs, ...); +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); + +#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) +#define GUEST_DONE() ucall(UCALL_DONE, 0) +#define GUEST_ASSERT(_condition) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2, \ + "Failed guest assert: " \ + #_condition, __LINE__); \ +} while (0) #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h index 54cfeb6568d3..31e030915c1f 100644 --- a/tools/testing/selftests/kvm/include/sparsebit.h +++ b/tools/testing/selftests/kvm/include/sparsebit.h @@ -15,8 +15,8 @@ * even in the case where most bits are set. 
*/ -#ifndef _TEST_SPARSEBIT_H_ -#define _TEST_SPARSEBIT_H_ +#ifndef SELFTEST_KVM_SPARSEBIT_H +#define SELFTEST_KVM_SPARSEBIT_H #include <stdbool.h> #include <stdint.h> @@ -72,4 +72,4 @@ void sparsebit_validate_internal(struct sparsebit *sbit); } #endif -#endif /* _TEST_SPARSEBIT_H_ */ +#endif /* SELFTEST_KVM_SPARSEBIT_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 73c3933436ec..c7dafe8bd02c 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -7,8 +7,8 @@ * */ -#ifndef TEST_UTIL_H -#define TEST_UTIL_H 1 +#ifndef SELFTEST_KVM_TEST_UTIL_H +#define SELFTEST_KVM_TEST_UTIL_H #include <stdlib.h> #include <stdarg.h> @@ -41,4 +41,4 @@ void test_assert(bool exp, const char *exp_str, #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ } while (0) -#endif /* TEST_UTIL_H */ +#endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 42c3596815b8..e2884c2b81ff 100644 --- a/tools/testing/selftests/kvm/include/x86.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1,5 +1,5 @@ /* - * tools/testing/selftests/kvm/include/x86.h + * tools/testing/selftests/kvm/include/x86_64/processor.h * * Copyright (C) 2018, Google LLC. * @@ -7,8 +7,8 @@ * */ -#ifndef SELFTEST_KVM_X86_H -#define SELFTEST_KVM_X86_H +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H #include <assert.h> #include <stdint.h> @@ -305,7 +305,25 @@ static inline unsigned long get_xmm(int n) struct kvm_x86_state; struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_x86_state *state); + +struct kvm_cpuid2 *kvm_get_supported_cpuid(void); +void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid); + +struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); + +static inline struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_entry(uint32_t function) +{ + return kvm_get_supported_cpuid_index(function, 0); +} + +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value); /* * Basic CPU control in CR0 @@ -1044,4 +1062,4 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 -#endif /* !SELFTEST_KVM_X86_H */ +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index b9ffe1024d3a..c9bd935b939c 100644 --- a/tools/testing/selftests/kvm/include/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -1,5 +1,5 @@ /* - * tools/testing/selftests/kvm/include/vmx.h + * tools/testing/selftests/kvm/include/x86_64/vmx.h * * Copyright (C) 2018, Google LLC. 
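Note that the CPUID and MSR helpers above now live in the x86_64-only processor.h rather than the generic header. A typical, illustrative use is gating a test on a feature bit before setting the guest CPUID, as the converted tests later in the patch do (VCPU_ID and guest_code are placeholders; X86_FEATURE_XSAVE is a test-local (1 << 26) macro):

	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);

	if (!(entry->ecx & X86_FEATURE_XSAVE)) {
		printf("XSAVE feature not supported, skipping test\n");
		exit(KSFT_SKIP);
	}

	vm = vm_create_default(VCPU_ID, 0, guest_code);
	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());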
* @@ -11,7 +11,7 @@ #define SELFTEST_KVM_VMX_H #include <stdint.h> -#include "x86.h" +#include "processor.h" #define CPUID_VMX_BIT 5 @@ -339,6 +339,8 @@ struct vmx_msr_entry { uint64_t value; } __attribute__ ((aligned(16))); +#include "evmcs.h" + static inline int vmxon(uint64_t phys) { uint8_t ret; @@ -372,6 +374,9 @@ static inline int vmptrld(uint64_t vmcs_pa) { uint8_t ret; + if (enable_evmcs) + return -1; + __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]" : [ret]"=rm"(ret) : [pa]"m"(vmcs_pa) @@ -385,6 +390,9 @@ static inline int vmptrst(uint64_t *value) uint64_t tmp; uint8_t ret; + if (enable_evmcs) + return evmcs_vmptrst(value); + __asm__ __volatile__("vmptrst %[value]; setna %[ret]" : [value]"=m"(tmp), [ret]"=rm"(ret) : : "cc", "memory"); @@ -411,6 +419,9 @@ static inline int vmlaunch(void) { int ret; + if (enable_evmcs) + return evmcs_vmlaunch(); + __asm__ __volatile__("push %%rbp;" "push %%rcx;" "push %%rdx;" @@ -443,6 +454,9 @@ static inline int vmresume(void) { int ret; + if (enable_evmcs) + return evmcs_vmresume(); + __asm__ __volatile__("push %%rbp;" "push %%rcx;" "push %%rdx;" @@ -482,6 +496,9 @@ static inline int vmread(uint64_t encoding, uint64_t *value) uint64_t tmp; uint8_t ret; + if (enable_evmcs) + return evmcs_vmread(encoding, value); + __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]" : [value]"=rm"(tmp), [ret]"=rm"(ret) : [encoding]"r"(encoding) @@ -506,6 +523,9 @@ static inline int vmwrite(uint64_t encoding, uint64_t value) { uint8_t ret; + if (enable_evmcs) + return evmcs_vmwrite(encoding, value); + __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]" : [ret]"=rm"(ret) : [value]"rm"(value), [encoding]"r"(encoding) @@ -543,10 +563,19 @@ struct vmx_pages { void *vmwrite_hva; uint64_t vmwrite_gpa; void *vmwrite; + + void *vp_assist_hva; + uint64_t vp_assist_gpa; + void *vp_assist; + + void *enlightened_vmcs_hva; + uint64_t enlightened_vmcs_gpa; + void *enlightened_vmcs; }; struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); +bool load_vmcs(struct vmx_pages *vmx); -#endif /* !SELFTEST_KVM_VMX_H */ +#endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c new file mode 100644 index 000000000000..b6022e2f116e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AArch64 code + * + * Copyright (C) 2018, Red Hat, Inc. 
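With VMCLEAR/VMPTRLD split out of prepare_for_vmx_operation() into the new load_vmcs(), an L1 guest is expected to call the helpers in this order. This is a sketch mirroring the evmcs test added later in the patch; l2_guest_code, l2_guest_stack and L2_GUEST_STACK_SIZE are that test's names:

	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));  /* VMXON only        */
	GUEST_ASSERT(load_vmcs(vmx_pages));                  /* VMCLEAR + VMPTRLD,
	                                                        or evmcs_vmptrld  */
	prepare_vmcs(vmx_pages, l2_guest_code,
		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
	GUEST_ASSERT(!vmlaunch());

The split is what lets the enlightened-VMCS path of load_vmcs() substitute evmcs_vmptrld() for the VMPTRLD-based sequence.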
+ */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" + +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 +#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 + +static uint64_t page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size) & ~(vm->page_size - 1); +} + +static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; + + return (gva >> shift) & mask; +} + +static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels == 4, + "Mode %d does not have 4 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels >= 3, + "Mode %d does not have >= 3 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + return (gva >> vm->page_shift) & mask; +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) +{ + uint64_t mask = ((1UL << (vm->va_bits - vm->page_shift)) - 1) << vm->page_shift; + return entry & mask; +} + +static uint64_t ptrs_per_pgd(struct kvm_vm *vm) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + return 1 << (vm->va_bits - shift); +} + +static uint64_t ptrs_per_pte(struct kvm_vm *vm) +{ + return 1 << (vm->page_shift - 3); +} + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +{ + int rc; + + if (!vm->pgd_created) { + vm_paddr_t paddr = vm_phy_pages_alloc(vm, + page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + vm->pgd = paddr; + vm->pgd_created = true; + } +} + +void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot, uint64_t flags) +{ + uint8_t attr_idx = flags & 7; + uint64_t *ptep; + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, 
KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; + break; + default: + TEST_ASSERT(false, "Page table levels must be 2, 3, or 4"); + } + + *ptep = paddr | 3; + *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot) +{ + uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ + + _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); +} + +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep; + + if (!vm->pgd_created) + goto unmapped_gva; + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + break; + default: + TEST_ASSERT(false, "Page table levels must be 2, 3, or 4"); + } + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + +unmapped_gva: + TEST_ASSERT(false, "No mapping for vm virtual address, " + "gva: 0x%lx", gva); +} + +static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) +{ +#ifdef DEBUG_VM + static const char * const type[] = { "", "pud", "pmd", "pte" }; + uint64_t pte, *ptep; + + if (level == 4) + return; + + for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { + ptep = addr_gpa2hva(vm, pte); + if (!*ptep) + continue; + printf("%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); + } +#endif +} + +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + int level = 4 - (vm->pgtable_levels - 1); + uint64_t pgd, *ptep; + + if (!vm->pgd_created) + return; + + for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + ptep = addr_gpa2hva(vm, pgd); + if (!*ptep) + continue; + printf("%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); + } +} + +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + uint64_t ptrs_per_4k_pte = 512; + uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2; + struct kvm_vm *vm; + + vm = vm_create(VM_MODE_P52V48_4K, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + size_t stack_size = vm->page_size == 4096 ? 
+ DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + + vm_vcpu_add(vm, vcpuid, 0, 0); + + set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); +} + +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +{ + struct kvm_vcpu_init init; + uint64_t sctlr_el1, tcr_el1; + + memset(&init, 0, sizeof(init)); + init.target = KVM_ARM_TARGET_GENERIC_V8; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, &init); + + /* + * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 + * registers, which the variable argument list macros do. + */ + set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20); + + get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1); + get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1); + + switch (vm->mode) { + case VM_MODE_P52V48_4K: + tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + break; + case VM_MODE_P52V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + break; + case VM_MODE_P40V48_4K: + tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + break; + case VM_MODE_P40V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + break; + default: + TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + } + + sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; + /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; + tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); + tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; + + set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1); + set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1); + set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1); + set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd); +} + +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + uint64_t pstate, pc; + + get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); + get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); + + fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n", + indent, "", pstate, pc); + +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index cd01144d27c8..6398efe67885 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -13,7 +13,7 @@ #include <execinfo.h> #include <sys/syscall.h> -#include "../../kselftest.h" +#include "kselftest.h" /* Dumps the current stack trace to stderr. */ static void __attribute__((noinline)) test_dump_stack(void); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6fd8c089cafc..8c06da4f03db 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -16,10 +16,8 @@ #include <sys/stat.h> #include <linux/kernel.h> -#define KVM_DEV_PATH "/dev/kvm" - #define KVM_UTIL_PGS_PER_HUGEPG 512 -#define KVM_UTIL_MIN_PADDR 0x2000 +#define KVM_UTIL_MIN_PFN 2 /* Aligns x up to the next multiple of size. Size must be a power of 2. 
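Everything in the aarch64 page-table index helpers and in this TCR_EL1 setup is derived from page_shift, pgtable_levels and va_bits. Worked numbers for the VM_MODE_P40V48_4K case, purely as illustrative arithmetic following the expressions above:

	page_shift = 12, pgtable_levels = 4, va_bits = 48, pa_bits = 40

	pte_index shift = 12                      -> GVA bits 20:12
	pmd_index shift = (12 - 3) + 12 = 21      -> GVA bits 29:21
	pud_index shift = 2*(12 - 3) + 12 = 30    -> GVA bits 38:30
	pgd_index shift = 3*(12 - 3) + 12 = 39    -> GVA bits 47:39

	TCR_EL1.TG0  = 0                  (4K granule)
	TCR_EL1.IPS  = 2                  (40-bit IPA)
	TCR_EL1.T0SZ = 64 - va_bits = 16  (48-bit TTBR0 range)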
*/ static void *align(void *x, size_t size) @@ -30,7 +28,8 @@ static void *align(void *x, size_t size) return (void *) (((size_t) x + mask) & ~mask); } -/* Capability +/* + * Capability * * Input Args: * cap - Capability @@ -92,16 +91,23 @@ static void vm_open(struct kvm_vm *vm, int perm) if (vm->kvm_fd < 0) exit(KSFT_SKIP); - /* Create VM. */ vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " "rc: %i errno: %i", vm->fd, errno); } -/* VM Create +const char * const vm_guest_mode_string[] = { + "PA-bits:52, VA-bits:48, 4K pages", + "PA-bits:52, VA-bits:48, 64K pages", + "PA-bits:40, VA-bits:48, 4K pages", + "PA-bits:40, VA-bits:48, 64K pages", +}; + +/* + * VM Create * * Input Args: - * mode - VM Mode (e.g. VM_MODE_FLAT48PG) + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) * phy_pages - Physical memory pages * perm - permission * @@ -110,7 +116,7 @@ static void vm_open(struct kvm_vm *vm, int perm) * Return: * Pointer to opaque structure that describes the created VM. * - * Creates a VM with the mode specified by mode (e.g. VM_MODE_FLAT48PG). + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). * When phy_pages is non-zero, a memory region of phy_pages physical pages * is created and mapped starting at guest physical address 0. The file * descriptor to control the created VM is created with the permissions @@ -121,7 +127,6 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) struct kvm_vm *vm; int kvm_fd; - /* Allocate memory. */ vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficent Memory"); @@ -130,26 +135,48 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) /* Setup mode specific traits. */ switch (vm->mode) { - case VM_MODE_FLAT48PG: + case VM_MODE_P52V48_4K: + vm->pgtable_levels = 4; vm->page_size = 0x1000; vm->page_shift = 12; - - /* Limit to 48-bit canonical virtual addresses. */ - vm->vpages_valid = sparsebit_alloc(); - sparsebit_set_num(vm->vpages_valid, - 0, (1ULL << (48 - 1)) >> vm->page_shift); - sparsebit_set_num(vm->vpages_valid, - (~((1ULL << (48 - 1)) - 1)) >> vm->page_shift, - (1ULL << (48 - 1)) >> vm->page_shift); - - /* Limit physical addresses to 52-bits. */ - vm->max_gfn = ((1ULL << 52) >> vm->page_shift) - 1; + vm->va_bits = 48; + break; + case VM_MODE_P52V48_64K: + vm->pgtable_levels = 3; + vm->pa_bits = 52; + vm->page_size = 0x10000; + vm->page_shift = 16; + vm->va_bits = 48; + break; + case VM_MODE_P40V48_4K: + vm->pgtable_levels = 4; + vm->pa_bits = 40; + vm->va_bits = 48; + vm->page_size = 0x1000; + vm->page_shift = 12; + break; + case VM_MODE_P40V48_64K: + vm->pgtable_levels = 3; + vm->pa_bits = 40; + vm->va_bits = 48; + vm->page_size = 0x10000; + vm->page_shift = 16; break; - default: TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); } + /* Limit to VA-bit canonical virtual addresses. */ + vm->vpages_valid = sparsebit_alloc(); + sparsebit_set_num(vm->vpages_valid, + 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + sparsebit_set_num(vm->vpages_valid, + (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, + (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + + /* Limit physical addresses to PA-bits. */ + vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; + /* Allocate and setup memory for guest. 
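vm_create() now derives the usable guest-physical range and the canonical guest-virtual ranges from the per-mode pa_bits/va_bits instead of hard-wiring the old 52/48-bit values. For VM_MODE_P40V48_4K the limits work out as follows (a sketch of the arithmetic only; the sparsebit calls are exactly the ones above):

	max_gfn = ((1ULL << 40) >> 12) - 1 = 0xfffffff

	valid GVAs, lower half: pages [0, (1ULL << 47) >> 12)
	valid GVAs, upper half: pages from (~((1ULL << 47) - 1)) >> 12 upward,
	                        i.e. GVAs at or above 0xffff800000000000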
*/ vm->vpages_mapped = sparsebit_alloc(); if (phy_pages != 0) @@ -159,7 +186,8 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } -/* VM Restart +/* + * VM Restart * * Input Args: * vm - VM that has been released before @@ -186,7 +214,8 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%lx size: 0x%lx", - ret, errno, region->region.slot, region->region.flags, + ret, errno, region->region.slot, + region->region.flags, region->region.guest_phys_addr, region->region.memory_size); } @@ -202,7 +231,8 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) strerror(-ret)); } -/* Userspace Memory Region Find +/* + * Userspace Memory Region Find * * Input Args: * vm - Virtual Machine @@ -220,8 +250,8 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) * of the regions is returned. Null is returned only when no overlapping * region exists. */ -static struct userspace_mem_region *userspace_mem_region_find( - struct kvm_vm *vm, uint64_t start, uint64_t end) +static struct userspace_mem_region * +userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { struct userspace_mem_region *region; @@ -237,7 +267,8 @@ static struct userspace_mem_region *userspace_mem_region_find( return NULL; } -/* KVM Userspace Memory Region Find +/* + * KVM Userspace Memory Region Find * * Input Args: * vm - Virtual Machine @@ -265,7 +296,8 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return ®ion->region; } -/* VCPU Find +/* + * VCPU Find * * Input Args: * vm - Virtual Machine @@ -280,8 +312,7 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU * for the specified vcpuid. */ -struct vcpu *vcpu_find(struct kvm_vm *vm, - uint32_t vcpuid) +struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpup; @@ -293,7 +324,8 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, return NULL; } -/* VM VCPU Remove +/* + * VM VCPU Remove * * Input Args: * vm - Virtual Machine @@ -330,11 +362,9 @@ void kvm_vm_release(struct kvm_vm *vmp) { int ret; - /* Free VCPUs. */ while (vmp->vcpu_head) vm_vcpu_rm(vmp, vmp->vcpu_head->id); - /* Close file descriptor for the VM. */ ret = close(vmp->fd); TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); @@ -344,7 +374,8 @@ void kvm_vm_release(struct kvm_vm *vmp) " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); } -/* Destroys and frees the VM pointed to by vmp. +/* + * Destroys and frees the VM pointed to by vmp. */ void kvm_vm_free(struct kvm_vm *vmp) { @@ -383,7 +414,8 @@ void kvm_vm_free(struct kvm_vm *vmp) free(vmp); } -/* Memory Compare, host virtual to guest virtual +/* + * Memory Compare, host virtual to guest virtual * * Input Args: * hva - Starting host virtual address @@ -405,23 +437,25 @@ void kvm_vm_free(struct kvm_vm *vmp) * a length of len, to the guest bytes starting at the guest virtual * address given by gva. */ -int kvm_memcmp_hva_gva(void *hva, - struct kvm_vm *vm, vm_vaddr_t gva, size_t len) +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) { size_t amt; - /* Compare a batch of bytes until either a match is found + /* + * Compare a batch of bytes until either a match is found * or all the bytes have been compared. 
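kvm_vm_restart() together with kvm_vm_release() is what the state save/restore tests in this series build on: the VM fd is closed, reopened with the same memslot layout, and a previously saved vcpu state is loaded back. A rough sketch of that sequence using the x86 helpers declared in processor.h (the exact steps a given test performs may differ):

	struct kvm_x86_state *state;

	state = vcpu_save_state(vm, VCPU_ID);
	kvm_vm_release(vm);

	kvm_vm_restart(vm, O_RDWR);       /* reopen /dev/kvm, re-register memslots */
	vm_vcpu_add(vm, VCPU_ID, 0, 0);
	vcpu_load_state(vm, VCPU_ID, state);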
*/ for (uintptr_t offset = 0; offset < len; offset += amt) { uintptr_t ptr1 = (uintptr_t)hva + offset; - /* Determine host address for guest virtual address + /* + * Determine host address for guest virtual address * at offset. */ uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); - /* Determine amount to compare on this pass. + /* + * Determine amount to compare on this pass. * Don't allow the comparsion to cross a page boundary. */ amt = len - offset; @@ -433,7 +467,8 @@ int kvm_memcmp_hva_gva(void *hva, assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); - /* Perform the comparison. If there is a difference + /* + * Perform the comparison. If there is a difference * return that result to the caller, otherwise need * to continue on looking for a mismatch. */ @@ -442,109 +477,15 @@ int kvm_memcmp_hva_gva(void *hva, return ret; } - /* No mismatch found. Let the caller know the two memory + /* + * No mismatch found. Let the caller know the two memory * areas are equal. */ return 0; } -/* Allocate an instance of struct kvm_cpuid2 - * - * Input Args: None - * - * Output Args: None - * - * Return: A pointer to the allocated struct. The caller is responsible - * for freeing this struct. - * - * Since kvm_cpuid2 uses a 0-length array to allow a the size of the - * array to be decided at allocation time, allocation is slightly - * complicated. This function uses a reasonable default length for - * the array and performs the appropriate allocation. - */ -static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) -{ - struct kvm_cpuid2 *cpuid; - int nent = 100; - size_t size; - - size = sizeof(*cpuid); - size += nent * sizeof(struct kvm_cpuid_entry2); - cpuid = malloc(size); - if (!cpuid) { - perror("malloc"); - abort(); - } - - cpuid->nent = nent; - - return cpuid; -} - -/* KVM Supported CPUID Get - * - * Input Args: None - * - * Output Args: - * - * Return: The supported KVM CPUID - * - * Get the guest CPUID supported by KVM. - */ -struct kvm_cpuid2 *kvm_get_supported_cpuid(void) -{ - static struct kvm_cpuid2 *cpuid; - int ret; - int kvm_fd; - - if (cpuid) - return cpuid; - - cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); - - ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); - TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", - ret, errno); - - close(kvm_fd); - return cpuid; -} - -/* Locate a cpuid entry. - * - * Input Args: - * cpuid: The cpuid. - * function: The function of the cpuid entry to find. - * - * Output Args: None - * - * Return: A pointer to the cpuid entry. Never returns NULL. 
- */ -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) -{ - struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry = NULL; - int i; - - cpuid = kvm_get_supported_cpuid(); - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == function && - cpuid->entries[i].index == index) { - entry = &cpuid->entries[i]; - break; - } - } - - TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", - function, index); - return entry; -} - -/* VM Userspace Memory Region Add +/* + * VM Userspace Memory Region Add * * Input Args: * vm - Virtual Machine @@ -586,7 +527,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, " vm->max_gfn: 0x%lx vm->page_size: 0x%x", guest_paddr, npages, vm->max_gfn, vm->page_size); - /* Confirm a mem region with an overlapping address doesn't + /* + * Confirm a mem region with an overlapping address doesn't * already exist. */ region = (struct userspace_mem_region *) userspace_mem_region_find( @@ -677,7 +619,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, vm->userspace_mem_region_head = region; } -/* Memslot to region +/* + * Memslot to region * * Input Args: * vm - Virtual Machine @@ -691,8 +634,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, * on error (e.g. currently no memory region using memslot as a KVM * memory slot ID). */ -static struct userspace_mem_region *memslot2region(struct kvm_vm *vm, - uint32_t memslot) +static struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; @@ -712,7 +655,8 @@ static struct userspace_mem_region *memslot2region(struct kvm_vm *vm, return region; } -/* VM Memory Region Flags Set +/* + * VM Memory Region Flags Set * * Input Args: * vm - Virtual Machine @@ -730,7 +674,6 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) int ret; struct userspace_mem_region *region; - /* Locate memory region. */ region = memslot2region(vm, slot); region->region.flags = flags; @@ -742,7 +685,8 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } -/* VCPU mmap Size +/* + * VCPU mmap Size * * Input Args: None * @@ -772,7 +716,8 @@ static int vcpu_mmap_sz(void) return ret; } -/* VM VCPU Add +/* + * VM VCPU Add * * Input Args: * vm - Virtual Machine @@ -785,7 +730,8 @@ static int vcpu_mmap_sz(void) * Creates and adds to the VM specified by vm and virtual CPU with * the ID given by vcpuid. */ -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot) +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, + int gdt_memslot) { struct vcpu *vcpu; @@ -823,7 +769,8 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_me vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot); } -/* VM Virtual Address Unused Gap +/* + * VM Virtual Address Unused Gap * * Input Args: * vm - Virtual Machine @@ -843,14 +790,14 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_me * sz unallocated bytes >= vaddr_min is available. */ static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min) + vm_vaddr_t vaddr_min) { uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; /* Determine lowest permitted virtual page index. 
*/ uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift; if ((pgidx_start * vm->page_size) < vaddr_min) - goto no_va_found; + goto no_va_found; /* Loop over section with enough valid virtual page indexes. */ if (!sparsebit_is_set_num(vm->vpages_valid, @@ -909,7 +856,8 @@ va_found: return pgidx_start * vm->page_size; } -/* VM Virtual Address Allocate +/* + * VM Virtual Address Allocate * * Input Args: * vm - Virtual Machine @@ -930,13 +878,14 @@ va_found: * a page. */ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot) + uint32_t data_memslot, uint32_t pgd_memslot) { uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); virt_pgd_alloc(vm, pgd_memslot); - /* Find an unused range of virtual page addresses of at least + /* + * Find an unused range of virtual page addresses of at least * pages in length. */ vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); @@ -946,7 +895,8 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, pages--, vaddr += vm->page_size) { vm_paddr_t paddr; - paddr = vm_phy_page_alloc(vm, KVM_UTIL_MIN_PADDR, data_memslot); + paddr = vm_phy_page_alloc(vm, + KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); virt_pg_map(vm, vaddr, paddr, pgd_memslot); @@ -990,7 +940,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -/* Address VM Physical to Host Virtual +/* + * Address VM Physical to Host Virtual * * Input Args: * vm - Virtual Machine @@ -1022,7 +973,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) return NULL; } -/* Address Host Virtual to VM Physical +/* + * Address Host Virtual to VM Physical * * Input Args: * vm - Virtual Machine @@ -1056,7 +1008,8 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) return -1; } -/* VM Create IRQ Chip +/* + * VM Create IRQ Chip * * Input Args: * vm - Virtual Machine @@ -1078,7 +1031,8 @@ void vm_create_irqchip(struct kvm_vm *vm) vm->has_irqchip = true; } -/* VM VCPU State +/* + * VM VCPU State * * Input Args: * vm - Virtual Machine @@ -1100,7 +1054,8 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) return vcpu->state; } -/* VM VCPU Run +/* + * VM VCPU Run * * Input Args: * vm - Virtual Machine @@ -1126,13 +1081,14 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) int rc; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - do { + do { rc = ioctl(vcpu->fd, KVM_RUN, NULL); } while (rc == -1 && errno == EINTR); return rc; } -/* VM VCPU Set MP State +/* + * VM VCPU Set MP State * * Input Args: * vm - Virtual Machine @@ -1147,7 +1103,7 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) * by mp_state. */ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state) + struct kvm_mp_state *mp_state) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; @@ -1159,7 +1115,8 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, "rc: %i errno: %i", ret, errno); } -/* VM VCPU Regs Get +/* + * VM VCPU Regs Get * * Input Args: * vm - Virtual Machine @@ -1173,21 +1130,20 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, * Obtains the current register state for the VCPU specified by vcpuid * and stores it at the location given by regs. 
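vm_vaddr_alloc() hands back guest-virtual pages that are already backed (from the data memslot, now at or above KVM_UTIL_MIN_PFN rather than a fixed physical address) and mapped through the page tables. The vmx helpers later in this patch use it like this; the 0x10000 minimum address and memslot 0 are simply the values they pass:

	void *page = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
	void *hva  = addr_gva2hva(vm, (uintptr_t)page);  /* host view of the page */

	memset(hva, 0, getpagesize());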
*/ -void vcpu_regs_get(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_regs *regs) +void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Get the regs. */ ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", ret, errno); } -/* VM VCPU Regs Set +/* + * VM VCPU Regs Set * * Input Args: * vm - Virtual Machine @@ -1201,165 +1157,46 @@ void vcpu_regs_get(struct kvm_vm *vm, * Sets the regs of the VCPU specified by vcpuid to the values * given by regs. */ -void vcpu_regs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_regs *regs) +void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Set the regs. */ ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", ret, errno); } void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) + struct kvm_vcpu_events *events) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Get the regs. */ ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", ret, errno); } void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) + struct kvm_vcpu_events *events) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Set the regs. */ ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", ret, errno); } -/* VCPU Get MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * - * Output Args: None - * - * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. - * - * Get value of MSR for VCPU. - */ -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - struct { - struct kvm_msrs header; - struct kvm_msr_entry entry; - } buffer = {}; - int r; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - buffer.header.nmsrs = 1; - buffer.entry.index = msr_index; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); - - return buffer.entry.data; -} - -/* VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: On success, nothing. On failure a TEST_ASSERT is produced. - * - * Set value of MSR for VCPU. 
- */ -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - struct { - struct kvm_msrs header; - struct kvm_msr_entry entry; - } buffer = {}; - int r; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - memset(&buffer, 0, sizeof(buffer)); - buffer.header.nmsrs = 1; - buffer.entry.index = msr_index; - buffer.entry.data = msr_value; - r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); -} - -/* VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first num function input arguments to the values - * given as variable args. Each of the variable args is expected to - * be of type uint64_t. - */ -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) -{ - va_list ap; - struct kvm_regs regs; - - TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" - " num: %u\n", - num); - - va_start(ap, num); - vcpu_regs_get(vm, vcpuid, ®s); - - if (num >= 1) - regs.rdi = va_arg(ap, uint64_t); - - if (num >= 2) - regs.rsi = va_arg(ap, uint64_t); - - if (num >= 3) - regs.rdx = va_arg(ap, uint64_t); - - if (num >= 4) - regs.rcx = va_arg(ap, uint64_t); - - if (num >= 5) - regs.r8 = va_arg(ap, uint64_t); - - if (num >= 6) - regs.r9 = va_arg(ap, uint64_t); - - vcpu_regs_set(vm, vcpuid, ®s); - va_end(ap); -} - -/* VM VCPU System Regs Get +/* + * VM VCPU System Regs Get * * Input Args: * vm - Virtual Machine @@ -1373,22 +1210,20 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) * Obtains the current system register state for the VCPU specified by * vcpuid and stores it at the location given by sregs. */ -void vcpu_sregs_get(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs) +void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Get the regs. */ - /* Get the regs. */ ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", ret, errno); } -/* VM VCPU System Regs Set +/* + * VM VCPU System Regs Set * * Input Args: * vm - Virtual Machine @@ -1402,27 +1237,25 @@ void vcpu_sregs_get(struct kvm_vm *vm, * Sets the system regs of the VCPU specified by vcpuid to the values * given by sregs. */ -void vcpu_sregs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs) +void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) { int ret = _vcpu_sregs_set(vm, vcpuid, sregs); TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " "rc: %i errno: %i", ret, errno); } -int _vcpu_sregs_set(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_sregs *sregs) +int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - /* Get the regs. */ return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); } -/* VCPU Ioctl +/* + * VCPU Ioctl * * Input Args: * vm - Virtual Machine @@ -1434,8 +1267,8 @@ int _vcpu_sregs_set(struct kvm_vm *vm, * * Issues an arbitrary ioctl on a VCPU fd. 
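As with vcpu_run()/_vcpu_run(), the underscore-prefixed sregs setter returns the raw ioctl result while the plain variant asserts on failure, so a test that expects KVM to reject a value uses the former. A hedged sketch (the rejected CR4 bit is purely illustrative):

	struct kvm_sregs sregs;
	int rc;

	vcpu_sregs_get(vm, VCPU_ID, &sregs);
	sregs.cr4 |= (1ULL << 63);               /* a bit KVM should refuse */
	rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
	TEST_ASSERT(rc, "KVM unexpectedly accepted the bogus CR4 value");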
*/ -void vcpu_ioctl(struct kvm_vm *vm, - uint32_t vcpuid, unsigned long cmd, void *arg) +void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, + unsigned long cmd, void *arg) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; @@ -1447,7 +1280,8 @@ void vcpu_ioctl(struct kvm_vm *vm, cmd, ret, errno, strerror(errno)); } -/* VM Ioctl +/* + * VM Ioctl * * Input Args: * vm - Virtual Machine @@ -1467,7 +1301,8 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) cmd, ret, errno, strerror(errno)); } -/* VM Dump +/* + * VM Dump * * Input Args: * vm - Virtual Machine @@ -1514,38 +1349,6 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) vcpu_dump(stream, vm, vcpu->id, indent + 2); } -/* VM VCPU Dump - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the current state of the VCPU specified by vcpuid, within the VM - * given by vm, to the FILE stream given by stream. - */ -void vcpu_dump(FILE *stream, struct kvm_vm *vm, - uint32_t vcpuid, uint8_t indent) -{ - struct kvm_regs regs; - struct kvm_sregs sregs; - - fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); - - fprintf(stream, "%*sregs:\n", indent + 2, ""); - vcpu_regs_get(vm, vcpuid, ®s); - regs_dump(stream, ®s, indent + 4); - - fprintf(stream, "%*ssregs:\n", indent + 2, ""); - vcpu_sregs_get(vm, vcpuid, &sregs); - sregs_dump(stream, &sregs, indent + 4); -} - /* Known KVM exit reasons */ static struct exit_reason { unsigned int reason; @@ -1576,7 +1379,8 @@ static struct exit_reason { #endif }; -/* Exit Reason String +/* + * Exit Reason String * * Input Args: * exit_reason - Exit reason @@ -1602,10 +1406,12 @@ const char *exit_reason_str(unsigned int exit_reason) return "Unknown"; } -/* Physical Page Allocate +/* + * Physical Contiguous Page Allocator * * Input Args: * vm - Virtual Machine + * num - number of pages * paddr_min - Physical address minimum * memslot - Memory region to allocate page from * @@ -1614,47 +1420,59 @@ const char *exit_reason_str(unsigned int exit_reason) * Return: * Starting physical address * - * Within the VM specified by vm, locates an available physical page - * at or above paddr_min. If found, the page is marked as in use - * and its address is returned. A TEST_ASSERT failure occurs if no - * page is available at or above paddr_min. + * Within the VM specified by vm, locates a range of available physical + * pages at or above paddr_min. If found, the pages are marked as in use + * and thier base address is returned. A TEST_ASSERT failure occurs if + * not enough pages are available at or above paddr_min. */ -vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, - vm_paddr_t paddr_min, uint32_t memslot) +vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) { struct userspace_mem_region *region; - sparsebit_idx_t pg; + sparsebit_idx_t pg, base; + + TEST_ASSERT(num > 0, "Must allocate at least one page"); TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address " "not divisible by page size.\n" " paddr_min: 0x%lx page_size: 0x%x", paddr_min, vm->page_size); - /* Locate memory region. */ region = memslot2region(vm, memslot); + base = pg = paddr_min >> vm->page_shift; - /* Locate next available physical page at or above paddr_min. 
*/ - pg = paddr_min >> vm->page_shift; - - if (!sparsebit_is_set(region->unused_phy_pages, pg)) { - pg = sparsebit_next_set(region->unused_phy_pages, pg); - if (pg == 0) { - fprintf(stderr, "No guest physical page available, " - "paddr_min: 0x%lx page_size: 0x%x memslot: %u", - paddr_min, vm->page_size, memslot); - fputs("---- vm dump ----\n", stderr); - vm_dump(stderr, vm, 2); - abort(); + do { + for (; pg < base + num; ++pg) { + if (!sparsebit_is_set(region->unused_phy_pages, pg)) { + base = pg = sparsebit_next_set(region->unused_phy_pages, pg); + break; + } } + } while (pg && pg != base + num); + + if (pg == 0) { + fprintf(stderr, "No guest physical page available, " + "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n", + paddr_min, vm->page_size, memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + abort(); } - /* Specify page as in use and return its address. */ - sparsebit_clear(region->unused_phy_pages, pg); + for (pg = base; pg < base + num; ++pg) + sparsebit_clear(region->unused_phy_pages, pg); + + return base * vm->page_size; +} - return pg * vm->page_size; +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot) +{ + return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); } -/* Address Guest Virtual to Host Virtual +/* + * Address Guest Virtual to Host Virtual * * Input Args: * vm - Virtual Machine @@ -1669,17 +1487,3 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) { return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } - -void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id, - struct guest_args *args) -{ - struct kvm_run *run = vcpu_state(vm, vcpu_id); - struct kvm_regs regs; - - memset(®s, 0, sizeof(regs)); - vcpu_regs_get(vm, vcpu_id, ®s); - - args->port = run->io.port; - args->arg0 = regs.rdi; - args->arg1 = regs.rsi; -} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 542ed606b338..52701db0f253 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -1,28 +1,29 @@ /* - * tools/testing/selftests/kvm/lib/kvm_util.c + * tools/testing/selftests/kvm/lib/kvm_util_internal.h * * Copyright (C) 2018, Google LLC. * * This work is licensed under the terms of the GNU GPL, version 2. */ -#ifndef KVM_UTIL_INTERNAL_H -#define KVM_UTIL_INTERNAL_H 1 +#ifndef SELFTEST_KVM_UTIL_INTERNAL_H +#define SELFTEST_KVM_UTIL_INTERNAL_H #include "sparsebit.h" +#define KVM_DEV_PATH "/dev/kvm" + #ifndef BITS_PER_BYTE -#define BITS_PER_BYTE 8 +#define BITS_PER_BYTE 8 #endif #ifndef BITS_PER_LONG -#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) +#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) #endif #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) -/* Concrete definition of struct kvm_vm. 
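vm_phy_pages_alloc() generalizes the old single-page allocator: it scans the memslot's unused_phy_pages bitmap for num consecutive free pages, restarting from the next set bit whenever it hits a hole, and vm_phy_page_alloc() is now just the num == 1 case. Illustrative use (the count and minimum address are made up):

	/* Reserve 16 physically contiguous guest pages above 1 MiB in memslot 0. */
	vm_paddr_t base = vm_phy_pages_alloc(vm, 16, 0x100000, 0);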
*/ struct userspace_mem_region { struct userspace_mem_region *next, *prev; struct kvm_userspace_memory_region region; @@ -45,14 +46,16 @@ struct kvm_vm { int mode; int kvm_fd; int fd; + unsigned int pgtable_levels; unsigned int page_size; unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; uint64_t max_gfn; struct vcpu *vcpu_head; struct userspace_mem_region *userspace_mem_region_head; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; - bool has_irqchip; bool pgd_created; vm_paddr_t pgd; @@ -60,13 +63,11 @@ struct kvm_vm { vm_vaddr_t tss; }; -struct vcpu *vcpu_find(struct kvm_vm *vm, - uint32_t vcpuid); -void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot); +struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, + int gdt_memslot); void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -void regs_dump(FILE *stream, struct kvm_regs *regs, - uint8_t indent); -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, - uint8_t indent); +void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); +void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); -#endif +#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/ucall.c b/tools/testing/selftests/kvm/lib/ucall.c new file mode 100644 index 000000000000..4777f9bb5194 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/ucall.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include "kvm_util.h" +#include "kvm_util_internal.h" + +#define UCALL_PIO_PORT ((uint16_t)0x1000) + +static ucall_type_t ucall_type; +static vm_vaddr_t *ucall_exit_mmio_addr; + +static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) +{ + if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) + return false; + + virt_pg_map(vm, gpa, gpa, 0); + + ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; + sync_global_to_guest(vm, ucall_exit_mmio_addr); + + return true; +} + +void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg) +{ + ucall_type = type; + sync_global_to_guest(vm, ucall_type); + + if (type == UCALL_PIO) + return; + + if (type == UCALL_MMIO) { + vm_paddr_t gpa, start, end, step; + bool ret; + + if (arg) { + gpa = (vm_paddr_t)arg; + ret = ucall_mmio_init(vm, gpa); + TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); + return; + } + + /* + * Find an address within the allowed virtual address space, + * that does _not_ have a KVM memory region associated with it. + * Identity mapping an address like this allows the guest to + * access it, but as KVM doesn't know what to do with it, it + * will assume it's something userspace handles and exit with + * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. + * Here we start with a guess that the addresses around two + * thirds of the VA space are unmapped and then work both down + * and up from there in 1/6 VA space sized steps. 
+ */ + start = 1ul << (vm->va_bits * 2 / 3); + end = 1ul << vm->va_bits; + step = 1ul << (vm->va_bits / 6); + for (gpa = start; gpa >= 0; gpa -= step) { + if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1))) + return; + } + for (gpa = start + step; gpa < end; gpa += step) { + if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1))) + return; + } + TEST_ASSERT(false, "Can't find a ucall mmio address"); + } +} + +void ucall_uninit(struct kvm_vm *vm) +{ + ucall_type = 0; + sync_global_to_guest(vm, ucall_type); + ucall_exit_mmio_addr = 0; + sync_global_to_guest(vm, ucall_exit_mmio_addr); +} + +static void ucall_pio_exit(struct ucall *uc) +{ +#ifdef __x86_64__ + asm volatile("in %[port], %%al" + : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax"); +#endif +} + +static void ucall_mmio_exit(struct ucall *uc) +{ + *ucall_exit_mmio_addr = (vm_vaddr_t)uc; +} + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall uc = { + .cmd = cmd, + }; + va_list va; + int i; + + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + uc.args[i] = va_arg(va, uint64_t); + va_end(va); + + switch (ucall_type) { + case UCALL_PIO: + ucall_pio_exit(&uc); + break; + case UCALL_MMIO: + ucall_mmio_exit(&uc); + break; + }; +} + +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + + memset(uc, 0, sizeof(*uc)); + +#ifdef __x86_64__ + if (ucall_type == UCALL_PIO && run->exit_reason == KVM_EXIT_IO && + run->io.port == UCALL_PIO_PORT) { + struct kvm_regs regs; + vcpu_regs_get(vm, vcpu_id, ®s); + memcpy(uc, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), sizeof(*uc)); + return uc->cmd; + } +#endif + if (ucall_type == UCALL_MMIO && run->exit_reason == KVM_EXIT_MMIO && + run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { + vm_vaddr_t gva; + TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, + "Unexpected ucall exit mmio address access"); + gva = *(vm_vaddr_t *)run->mmio.data; + memcpy(uc, addr_gva2hva(vm, gva), sizeof(*uc)); + } + + return uc->cmd; +} diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a3122f1949a8..f28127f4a3af 100644 --- a/tools/testing/selftests/kvm/lib/x86.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1,5 +1,5 @@ /* - * tools/testing/selftests/kvm/lib/x86.c + * tools/testing/selftests/kvm/lib/x86_64/processor.c * * Copyright (C) 2018, Google LLC. * @@ -10,8 +10,8 @@ #include "test_util.h" #include "kvm_util.h" -#include "kvm_util_internal.h" -#include "x86.h" +#include "../kvm_util_internal.h" +#include "processor.h" /* Minimum physical address used for virtual translation tables. */ #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 @@ -231,7 +231,7 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) { int rc; - TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. 
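ucall_init() needs an argument only for the MMIO flavour: either an explicit guest-physical address for the exit page, or NULL, in which case the loop above probes for an identity-mappable address that no memslot backs (with va_bits = 48 the probe starts at 1 << 32 and moves in 1 << 8 byte steps). Illustrative setup for both types; the explicit GPA is made up:

	/* x86_64 tests: port I/O based ucalls, no argument needed. */
	ucall_init(vm, UCALL_PIO, NULL);

	/* aarch64 tests: MMIO based ucalls, either probed automatically ... */
	ucall_init(vm, UCALL_MMIO, NULL);
	/* ... or pinned to a known unbacked GPA chosen by the test. */
	ucall_init(vm, UCALL_MMIO, (void *)(10ul << 30));

	/* after the test */
	ucall_uninit(vm);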
*/ @@ -264,7 +264,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint16_t index[4]; struct pageMapL4Entry *pml4e; - TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % vm->page_size) == 0, @@ -551,7 +551,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) struct pageTableEntry *pte; void *hva; - TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); index[0] = (gva >> 12) & 0x1ffu; @@ -624,9 +624,9 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); switch (vm->mode) { - case VM_MODE_FLAT48PG: + case VM_MODE_P52V48_4K: sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); @@ -672,6 +672,102 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) vcpu_set_mp_state(vm, vcpuid, &mp_state); } +/* Allocate an instance of struct kvm_cpuid2 + * + * Input Args: None + * + * Output Args: None + * + * Return: A pointer to the allocated struct. The caller is responsible + * for freeing this struct. + * + * Since kvm_cpuid2 uses a 0-length array to allow a the size of the + * array to be decided at allocation time, allocation is slightly + * complicated. This function uses a reasonable default length for + * the array and performs the appropriate allocation. + */ +static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) +{ + struct kvm_cpuid2 *cpuid; + int nent = 100; + size_t size; + + size = sizeof(*cpuid); + size += nent * sizeof(struct kvm_cpuid_entry2); + cpuid = malloc(size); + if (!cpuid) { + perror("malloc"); + abort(); + } + + cpuid->nent = nent; + + return cpuid; +} + +/* KVM Supported CPUID Get + * + * Input Args: None + * + * Output Args: + * + * Return: The supported KVM CPUID + * + * Get the guest CPUID supported by KVM. + */ +struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int ret; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(); + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); + TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", + ret, errno); + + close(kvm_fd); + return cpuid; +} + +/* Locate a cpuid entry. + * + * Input Args: + * cpuid: The cpuid. + * function: The function of the cpuid entry to find. + * + * Output Args: None + * + * Return: A pointer to the cpuid entry. Never returns NULL. 
+ */ +struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) +{ + struct kvm_cpuid2 *cpuid; + struct kvm_cpuid_entry2 *entry = NULL; + int i; + + cpuid = kvm_get_supported_cpuid(); + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) { + entry = &cpuid->entries[i]; + break; + } + } + + TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", + function, index); + return entry; +} + /* VM VCPU CPUID Set * * Input Args: @@ -698,6 +794,7 @@ void vcpu_set_cpuid(struct kvm_vm *vm, rc, errno); } + /* Create a VM with reasonable defaults * * Input Args: @@ -726,7 +823,7 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; /* Create VM */ - vm = vm_create(VM_MODE_FLAT48PG, + vm = vm_create(VM_MODE_P52V48_4K, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); @@ -742,6 +839,154 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, return vm; } +/* VCPU Get MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + return buffer.entry.data; +} + +/* VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: On success, nothing. On failure a TEST_ASSERT is produced. + * + * Set value of MSR for VCPU. + */ +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); +} + +/* VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first num function input arguments to the values + * given as variable args. Each of the variable args is expected to + * be of type uint64_t. + */ +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
+{ + va_list ap; + struct kvm_regs regs; + + TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vm, vcpuid, ®s); + + if (num >= 1) + regs.rdi = va_arg(ap, uint64_t); + + if (num >= 2) + regs.rsi = va_arg(ap, uint64_t); + + if (num >= 3) + regs.rdx = va_arg(ap, uint64_t); + + if (num >= 4) + regs.rcx = va_arg(ap, uint64_t); + + if (num >= 5) + regs.r8 = va_arg(ap, uint64_t); + + if (num >= 6) + regs.r9 = va_arg(ap, uint64_t); + + vcpu_regs_set(vm, vcpuid, ®s); + va_end(ap); +} + +/* + * VM VCPU Dump + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * indent - Left margin indent amount + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the current state of the VCPU specified by vcpuid, within the VM + * given by vm, to the FILE stream given by stream. + */ +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + + fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + + fprintf(stream, "%*sregs:\n", indent + 2, ""); + vcpu_regs_get(vm, vcpuid, ®s); + regs_dump(stream, ®s, indent + 4); + + fprintf(stream, "%*ssregs:\n", indent + 2, ""); + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs_dump(stream, &sregs, indent + 4); +} + struct kvm_x86_state { struct kvm_vcpu_events events; struct kvm_mp_state mp_state; diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index b987c3c970eb..771ba6bf751c 100644 --- a/tools/testing/selftests/kvm/lib/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -1,5 +1,5 @@ /* - * tools/testing/selftests/kvm/lib/x86.c + * tools/testing/selftests/kvm/lib/x86_64/vmx.c * * Copyright (C) 2018, Google LLC. * @@ -10,9 +10,11 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #include "vmx.h" +bool enable_evmcs; + /* Allocate memory regions for nested VMX tests. * * Input Args: @@ -62,6 +64,20 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); + /* Setup of a region of guest memory for the VP Assist page. */ + vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); + vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); + + /* Setup of a region of guest memory for the enlightened VMCS. */ + vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->enlightened_vmcs_hva = + addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); + vmx->enlightened_vmcs_gpa = + addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs); + *p_vmx_gva = vmx_gva; return vmx; } @@ -107,18 +123,31 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx) if (vmxon(vmx->vmxon_gpa)) return false; - /* Load a VMCS. */ - *(uint32_t *)(vmx->vmcs) = vmcs_revision(); - if (vmclear(vmx->vmcs_gpa)) - return false; - - if (vmptrld(vmx->vmcs_gpa)) - return false; + return true; +} - /* Setup shadow VMCS, do not load it yet. */ - *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; - if (vmclear(vmx->shadow_vmcs_gpa)) - return false; +bool load_vmcs(struct vmx_pages *vmx) +{ + if (!enable_evmcs) { + /* Load a VMCS. 
*/ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = + vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; + } else { + if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, + vmx->enlightened_vmcs)) + return false; + current_evmcs->revision_id = vmcs_revision(); + } return true; } diff --git a/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 11ec358bf969..d503a51fad30 100644 --- a/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -17,7 +17,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #define X86_FEATURE_XSAVE (1<<26) #define X86_FEATURE_OSXSAVE (1<<27) @@ -67,6 +67,7 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; struct kvm_sregs sregs; struct kvm_cpuid_entry2 *entry; + struct ucall uc; int rc; entry = kvm_get_supported_cpuid_entry(1); @@ -87,21 +88,20 @@ int main(int argc, char *argv[]) rc = _vcpu_run(vm, VCPU_ID); if (run->exit_reason == KVM_EXIT_IO) { - switch (run->io.port) { - case GUEST_PORT_SYNC: + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: /* emulate hypervisor clearing CR4.OSXSAVE */ vcpu_sregs_get(vm, VCPU_ID, &sregs); sregs.cr4 &= ~X86_CR4_OSXSAVE; vcpu_sregs_set(vm, VCPU_ID, &sregs); break; - case GUEST_PORT_ABORT: + case UCALL_ABORT: TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); break; - case GUEST_PORT_DONE: + case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown port 0x%x.", - run->io.port); + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); } } } diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c new file mode 100644 index 000000000000..92c2cfd1b182 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for Enlightened VMCS, including nested guest state. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" + +#include "kvm_util.h" + +#include "vmx.h" + +#define VCPU_ID 5 + +static bool have_nested_state; + +void l2_guest_code(void) +{ + GUEST_SYNC(6); + + GUEST_SYNC(7); + + /* Done, exit to L1 and never come back. 
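 *
 * (For illustration: with the prepare_for_vmx_operation()/load_vmcs() split
 * above, nested guest code is expected to call the two helpers back to back,
 * e.g.
 *
 *	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
 *	GUEST_ASSERT(load_vmcs(vmx_pages));
 *
 * as the updated l1_guest_code() functions below do; when enable_evmcs is
 * set, load_vmcs() loads the enlightened VMCS instead of an ordinary one.)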
*/ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_SYNC(8); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(9); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + GUEST_SYNC(1); + GUEST_SYNC(2); + + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct vmx_pages *vmx_pages = NULL; + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + uint16_t evmcs_ver; + struct kvm_enable_cap enable_evmcs_cap = { + .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + .args[0] = (unsigned long)&evmcs_ver + }; + + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + if (!kvm_check_cap(KVM_CAP_NESTED_STATE) || + !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + printf("capabilities not available, skipping test\n"); + exit(KSFT_SKIP); + } + + vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); + + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx", + stage, (ulong)uc.args[1]); + + state = vcpu_save_state(vm, VCPU_ID); + kvm_vm_release(vm); + + /* Restore state in a new VM. 
*/ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID, 0, 0); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 3764e7121265..eb3e7a838cb4 100644 --- a/tools/testing/selftests/kvm/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -19,7 +19,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #define VCPU_ID 0 #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 @@ -48,7 +48,7 @@ static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) static void test_msr_platform_info_enabled(struct kvm_vm *vm) { struct kvm_run *run = vcpu_state(vm, VCPU_ID); - struct guest_args args; + struct ucall uc; set_msr_platform_info_enabled(vm, true); vcpu_run(vm, VCPU_ID); @@ -56,11 +56,11 @@ static void test_msr_platform_info_enabled(struct kvm_vm *vm) "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - guest_args_read(vm, VCPU_ID, &args); - TEST_ASSERT(args.port == GUEST_PORT_SYNC, - "Received IO from port other than PORT_HOST_SYNC: %u\n", - run->io.port); - TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == + get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %u\n", + ucall); + TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == MSR_PLATFORM_INFO_MAX_TURBO_RATIO, "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", MSR_PLATFORM_INFO_MAX_TURBO_RATIO); diff --git a/tools/testing/selftests/kvm/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 881419d5746e..35640e8e95bc 100644 --- a/tools/testing/selftests/kvm/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -22,7 +22,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #define VCPU_ID 5 diff --git a/tools/testing/selftests/kvm/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 900e3e9dfb9f..03da41f0f736 100644 --- a/tools/testing/selftests/kvm/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -17,7 +17,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #include "vmx.h" #define VCPU_ID 5 @@ -26,20 +26,20 @@ static bool have_nested_state; void l2_guest_code(void) { - GUEST_SYNC(5); + GUEST_SYNC(6); /* Exit to L1 */ vmcall(); /* L1 has now set up a shadow VMCS for us. */ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); - GUEST_SYNC(9); + GUEST_SYNC(10); GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); - GUEST_SYNC(10); + GUEST_SYNC(11); GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); - GUEST_SYNC(11); + GUEST_SYNC(12); /* Done, exit to L1 and never come back. 
*/ vmcall(); @@ -52,15 +52,17 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_vmcs(vmx_pages)); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - GUEST_SYNC(3); + GUEST_SYNC(4); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(4); + GUEST_SYNC(5); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); @@ -72,7 +74,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(!vmresume()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(6); + GUEST_SYNC(7); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); GUEST_ASSERT(!vmresume()); @@ -85,12 +87,12 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); GUEST_ASSERT(vmlaunch()); - GUEST_SYNC(7); + GUEST_SYNC(8); GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(vmresume()); vmwrite(GUEST_RIP, 0xc0ffee); - GUEST_SYNC(8); + GUEST_SYNC(9); GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); @@ -101,7 +103,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(vmresume()); - GUEST_SYNC(12); + GUEST_SYNC(13); GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(vmresume()); @@ -127,6 +129,7 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; struct kvm_run *run; struct kvm_x86_state *state; + struct ucall uc; int stage; struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); @@ -155,23 +158,23 @@ int main(int argc, char *argv[]) memset(®s1, 0, sizeof(regs1)); vcpu_regs_get(vm, VCPU_ID, ®s1); - switch (run->io.port) { - case GUEST_PORT_ABORT: - TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi, - __FILE__, regs1.rsi); + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], + __FILE__, uc.args[1]); /* NOT REACHED */ - case GUEST_PORT_SYNC: + case UCALL_SYNC: break; - case GUEST_PORT_DONE: + case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port); + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); } - /* PORT_SYNC is handled here. */ - TEST_ASSERT(!strcmp((const char *)regs1.rdi, "hello") && - regs1.rsi == stage, "Unexpected register values vmexit #%lx, got %lx", - stage, (ulong) regs1.rsi); + /* UCALL_SYNC is handled here. 
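 *
 * (For reference: on the guest side GUEST_SYNC(stage) is a two-argument
 * ucall, roughly
 *
 *	ucall(UCALL_SYNC, 2, "hello", stage);
 *
 * which is why the host side below checks uc.args[0] against "hello" and
 * uc.args[1] against the stage counter.)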
*/ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx", + stage, (ulong)uc.args[1]); state = vcpu_save_state(vm, VCPU_ID); kvm_vm_release(vm); diff --git a/tools/testing/selftests/kvm/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index 213343e5dff9..c8478ce9ea77 100644 --- a/tools/testing/selftests/kvm/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -19,7 +19,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #define VCPU_ID 5 diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 49bcc68b0235..18fa64db0d7a 100644 --- a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -1,5 +1,5 @@ /* - * gtests/tests/vmx_tsc_adjust_test.c + * vmx_tsc_adjust_test * * Copyright (C) 2018, Google LLC. * @@ -22,13 +22,13 @@ #include "test_util.h" #include "kvm_util.h" -#include "x86.h" +#include "processor.h" #include "vmx.h" #include <string.h> #include <sys/ioctl.h> -#include "../kselftest.h" +#include "kselftest.h" #ifndef MSR_IA32_TSC_ADJUST #define MSR_IA32_TSC_ADJUST 0x3b @@ -94,6 +94,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); /* Prepare the VMCS for L2 execution. */ prepare_vmcs(vmx_pages, l2_guest_code, @@ -146,26 +147,25 @@ int main(int argc, char *argv[]) for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); - struct guest_args args; + struct ucall uc; vcpu_run(vm, VCPU_ID); - guest_args_read(vm, VCPU_ID, &args); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (args.port) { - case GUEST_PORT_ABORT: - TEST_ASSERT(false, "%s", (const char *) args.arg0); + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s", (const char *)uc.args[0]); /* NOT REACHED */ - case GUEST_PORT_SYNC: - report(args.arg1); + case UCALL_SYNC: + report(uc.args[1]); break; - case GUEST_PORT_DONE: + case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown port 0x%x.", args.port); + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); } } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c92053bc3f96..11b98b2b0486 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -120,8 +120,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret, cpu; - if (type) - return -EINVAL; + ret = kvm_arm_setup_stage2(kvm, type); + if (ret) + return ret; kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); if (!kvm->arch.last_vcpu_ran) @@ -212,6 +213,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -240,7 +242,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; default: - r = kvm_arch_dev_ioctl_check_extension(kvm, ext); + r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; } return r; @@ -544,7 +546,7 @@ static void update_vttbr(struct kvm *kvm) /* update vttbr to be used with the new vmid */ pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); + 
BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; @@ -1295,8 +1297,6 @@ static void cpu_init_hyp_mode(void *dummy) __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); - - kvm_arm_init_debug(); } static void cpu_hyp_reset(void) @@ -1309,16 +1309,12 @@ static void cpu_hyp_reinit(void) { cpu_hyp_reset(); - if (is_kernel_in_hyp_mode()) { - /* - * __cpu_init_stage2() is safe to call even if the PM - * event was cancelled before the CPU was reset. - */ - __cpu_init_stage2(); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); - } else { + else cpu_init_hyp_mode(NULL); - } + + kvm_arm_init_debug(); if (vgic_present) kvm_vgic_init_cpu_hardware(); @@ -1412,6 +1408,8 @@ static int init_common_resources(void) kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); + kvm_set_ipa_limit(); + return 0; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index ed162a6c57c5..c23a1b323aad 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) @@ -150,20 +149,20 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); - stage2_pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(pud_table); + stage2_pud_free(kvm, pud_table); put_page(virt_to_page(pgd)); } static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); - VM_BUG_ON(stage2_pud_huge(*pud)); - stage2_pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); + VM_BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pmd_free(pmd_table); + stage2_pmd_free(kvm, pmd_table); put_page(virt_to_page(pud)); } @@ -252,7 +251,7 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (stage2_pte_table_empty(start_pte)) + if (stage2_pte_table_empty(kvm, start_pte)) clear_stage2_pmd_entry(kvm, pmd, start_addr); } @@ -262,9 +261,9 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = stage2_pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; @@ -281,7 +280,7 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, } } while (pmd++, addr = next, addr != end); - if (stage2_pmd_table_empty(start_pmd)) + if (stage2_pmd_table_empty(kvm, start_pmd)) clear_stage2_pud_entry(kvm, pud, start_addr); } @@ -291,14 +290,14 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, pgd, addr); do { - next = 
stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { pud_t old_pud = *pud; - stage2_pud_clear(pud); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); kvm_flush_dcache_pud(old_pud); put_page(virt_to_page(pud)); @@ -308,7 +307,7 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } } while (pud++, addr = next, addr != end); - if (stage2_pud_table_empty(start_pud)) + if (stage2_pud_table_empty(kvm, start_pud)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -332,7 +331,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Make sure the page table is still active, as another thread @@ -341,8 +340,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) */ if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (!stage2_pgd_none(*pgd)) + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) unmap_stage2_puds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock @@ -371,9 +370,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); @@ -389,11 +388,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -409,10 +408,11 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { - next = stage2_pgd_addr_end(addr, end); - stage2_flush_puds(kvm, pgd, addr, next); + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -897,7 +897,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) } /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); if (!pgd) return -ENOMEM; @@ -986,7 +986,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; } @@ -994,7 +994,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) /* Free the HW pgd, one page at a time */ if (pgd) - free_pages_exact(pgd, S2_PGD_SIZE); + free_pages_exact(pgd, stage2_pgd_size(kvm)); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1003,16 +1003,16 @@ 
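(The recurring change in this file is that fixed stage2 constants become
per-VM helpers derived from the IPA size chosen at VM creation. A rough
sketch of the idea, not the actual arm64 definitions:

	#define kvm_phys_shift(kvm)	((kvm)->arch.phys_shift)
	#define kvm_phys_size(kvm)	(1ULL << kvm_phys_shift(kvm))
	#define kvm_phys_mask(kvm)	(kvm_phys_size(kvm) - 1ULL)

so that walkers such as stage2_get_pud() below can honour a different guest
physical address space size for each VM.)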
static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); - if (WARN_ON(stage2_pgd_none(*pgd))) { + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(pgd, pud); + stage2_pgd_populate(kvm, pgd, pud); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(pgd, addr); + return stage2_pud_offset(kvm, pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1025,15 +1025,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (!pud) return NULL; - if (stage2_pud_none(*pud)) { + if (stage2_pud_none(kvm, *pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(pud, pmd); + stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } - return stage2_pmd_offset(pud, addr); + return stage2_pmd_offset(kvm, pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -1207,8 +1207,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (writable) pte = kvm_s2pte_mkwrite(pte); - ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1230,8 +1231,14 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); - if (PageTransCompoundMap(pfn_to_page(pfn))) { + /* + * PageTransCompoungMap() returns true for THP and + * hugetlbfs. Make sure the adjustment is done only for THP + * pages. + */ + if (!PageHuge(page) && PageTransCompoundMap(page)) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge @@ -1296,19 +1303,21 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) /** * stage2_wp_pmds - write protect PUD range + * kvm: kvm instance for the VM * @pud: pointer to pud entry * @addr: range start address * @end: range end address */ -static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) { pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) @@ -1328,18 +1337,19 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) * * Process PUD entries, for a huge PUD we cause a panic. 
*/ -static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(*pud)); - stage2_wp_pmds(pud, addr, next); + BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_wp_pmds(kvm, pud, addr, next); } } while (pud++, addr = next, addr != end); } @@ -1355,7 +1365,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1369,9 +1379,9 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) cond_resched_lock(&kvm->mmu_lock); if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (stage2_pgd_present(*pgd)) - stage2_wp_puds(pgd, addr, next); + next = stage2_pgd_addr_end(kvm, addr, end); + if (stage2_pgd_present(kvm, *pgd)) + stage2_wp_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1520,7 +1530,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), KVM_NR_MEM_OBJS); if (ret) return ret; @@ -1763,7 +1773,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); if (fault_status == FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); @@ -2062,7 +2072,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest IPA space. */ if (memslot->base_gfn + memslot->npages >= - (KVM_PHYS_SIZE >> PAGE_SHIFT)) + (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; down_read(¤t->mm->mmap_sem); diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 12502251727e..eb2a390a6c86 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -241,13 +241,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, list_for_each_entry(dev, &(its)->device_list, dev_list) \ list_for_each_entry(ite, &(dev)->itt_head, ite_list) -/* - * We only implement 48 bits of PA at the moment, although the ITS - * supports more. Let's be restrictive here. 
- */ -#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) - #define GIC_LPI_OFFSET 8192 #define VITS_TYPER_IDBITS 16 @@ -759,6 +752,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, { int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); int index; gfn_t gfn; @@ -783,7 +777,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (id >= (l1_tbl_size / esz)) return false; - addr = BASER_ADDRESS(baser) + id * esz; + addr = base + id * esz; gfn = addr >> PAGE_SHIFT; if (eaddr) @@ -798,7 +792,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, /* Each 1st level entry is represented by a 64-bit value. */ if (kvm_read_guest_lock(its->dev->kvm, - BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), + base + index * sizeof(indirect_ptr), &indirect_ptr, sizeof(indirect_ptr))) return false; @@ -808,11 +802,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (!(indirect_ptr & BIT_ULL(63))) return false; - /* - * Mask the guest physical address and calculate the frame number. - * Any address beyond our supported 48 bits of PA will be caught - * by the actual check in the final step. - */ + /* Mask the guest physical address and calculate the frame number. */ indirect_ptr &= GENMASK_ULL(51, 16); /* Find the address of the actual entry */ @@ -1304,9 +1294,6 @@ static u64 vgic_sanitise_its_baser(u64 reg) GITS_BASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ - reg &= ~GENMASK_ULL(15, 12); - /* We support only one (ITS) page size: 64K */ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; @@ -1325,11 +1312,8 @@ static u64 vgic_sanitise_its_cbaser(u64 reg) GITS_CBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* - * Sanitise the physical address to be 64k aligned. - * Also limit the physical addresses to 48 bits. - */ - reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); + /* Sanitise the physical address to be 64k aligned. 
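 *
 * (For reference: with 64K ITS pages the architecture packs physical address
 * bits [51:48] into GITS_BASER bits [15:12], so decoding the full 52-bit
 * table address looks roughly like
 *
 *	phys = (baser & GENMASK_ULL(47, 16)) |
 *	       ((baser & GENMASK_ULL(15, 12)) << 36);
 *
 * which is what the GITS_BASER_ADDR_48_to_52() helper used above expresses.)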
*/ + reg &= ~GENMASK_ULL(15, 12); return reg; } @@ -1375,7 +1359,7 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) if (!its->enabled) return; - cbaser = CBASER_ADDRESS(its->cbaser); + cbaser = GITS_CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, @@ -2233,7 +2217,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - l1_gpa = BASER_ADDRESS(baser); + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); if (baser & GITS_BASER_INDIRECT) { l1_esz = GITS_LVL1_ENTRY_SIZE; @@ -2305,7 +2289,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_coll_table; - gpa_t gpa = BASER_ADDRESS(baser); + gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; u64 val; size_t max_size, filled = 0; @@ -2354,7 +2338,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - gpa = BASER_ADDRESS(baser); + gpa = GITS_BASER_ADDR_48_to_52(baser); max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 6ada2432e37c..114dce9f4bf5 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -25,7 +25,7 @@ int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment) { - if (addr & ~KVM_PHYS_MASK) + if (addr & ~kvm_phys_mask(kvm)) return -E2BIG; if (!IS_ALIGNED(addr, alignment)) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a2a175b08b17..b3d1f0985117 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -364,7 +364,6 @@ static u64 vgic_sanitise_pendbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PENDBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } @@ -382,7 +381,6 @@ static u64 vgic_sanitise_propbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PROPBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 9e65feb6fa58..3710342cf6ad 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -83,6 +83,7 @@ static int coalesced_mmio_write(struct kvm_vcpu *vcpu, ring->coalesced_mmio[ring->last].phys_addr = addr; ring->coalesced_mmio[ring->last].len = len; memcpy(ring->coalesced_mmio[ring->last].data, val, len); + ring->coalesced_mmio[ring->last].pio = dev->zone.pio; smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; spin_unlock(&dev->kvm->ring_lock); @@ -140,6 +141,9 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, int ret; struct kvm_coalesced_mmio_dev *dev; + if (zone->pio != 1 && zone->pio != 0) + return -EINVAL; + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; @@ -149,8 +153,9 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, dev->zone = *zone; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, - zone->size, &dev->dev); + ret = kvm_io_bus_register_dev(kvm, + zone->pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS, + zone->addr, zone->size, &dev->dev); if (ret < 0) goto out_free_dev; list_add_tail(&dev->list, &kvm->coalesced_zones); @@ -174,7 +179,8 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); + kvm_io_bus_unregister_dev(kvm, + zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); kvm_iodevice_destructor(&dev->dev); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f986e31fa68c..786ade1843a2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -219,7 +219,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, me = get_cpu(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (!test_bit(i, vcpu_bitmap)) + if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) continue; kvm_make_request(req, vcpu); @@ -243,12 +243,10 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { cpumask_var_t cpus; bool called; - static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] - = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX}; zalloc_cpumask_var(&cpus, GFP_ATOMIC); - called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus); + called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); free_cpumask_var(cpus); return called; @@ -807,20 +805,25 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) * sorted array and known changed memslot position. */ static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new) + struct kvm_memory_slot *new, + enum kvm_mr_change change) { int id = new->id; int i = slots->id_to_index[id]; struct kvm_memory_slot *mslots = slots->memslots; WARN_ON(mslots[i].id != id); - if (!new->npages) { - WARN_ON(!mslots[i].npages); - if (mslots[i].npages) - slots->used_slots--; - } else { - if (!mslots[i].npages) - slots->used_slots++; + switch (change) { + case KVM_MR_CREATE: + slots->used_slots++; + WARN_ON(mslots[i].npages || !new->npages); + break; + case KVM_MR_DELETE: + slots->used_slots--; + WARN_ON(new->npages || !mslots[i].npages); + break; + default: + break; } while (i < KVM_MEM_SLOTS_NUM - 1 && @@ -1056,7 +1059,7 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(&new.arch, 0, sizeof(new.arch)); } - update_memslots(slots, &new); + update_memslots(slots, &new, change); old_memslots = install_new_memslots(kvm, as_id, slots); kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); @@ -1311,8 +1314,12 @@ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); /* - * If writable is set to false, the hva returned by this function is only - * allowed to be read. + * Return the hva of a @gfn and the R/W attribute if possible. + * + * @slot: the kvm_memory_slot which contains @gfn + * @gfn: the gfn to be translated + * @writable: used to return the read/write attribute of the @slot if the hva + * is valid and @writable is not NULL */ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable) @@ -2946,6 +2953,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; + case KVM_CAP_COALESCED_PIO: + return 1; #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: |
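With KVM_CAP_COALESCED_PIO advertised above, userspace can register a port
I/O range for coalescing through the existing KVM_REGISTER_COALESCED_MMIO
ioctl by setting the new pio field. A minimal sketch, where vm_fd and the
4-byte port at 0x608 are illustrative:

	struct kvm_coalesced_mmio_zone zone = {
		.addr = 0x608,
		.size = 4,
		.pio  = 1,
	};

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_PIO) > 0)
		ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);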