diff options
93 files changed, 2840 insertions, 1384 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9..05161afd7642 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2339,13 +2339,35 @@ kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) + kvm.eager_page_split= + [KVM,X86] Controls whether or not KVM will try to + proactively split all huge pages during dirty logging. + Eager page splitting reduces interruptions to vCPU + execution by eliminating the write-protection faults + and MMU lock contention that would otherwise be + required to split huge pages lazily. + + VM workloads that rarely perform writes or that write + only to a small region of VM memory may benefit from + disabling eager page splitting to allow huge pages to + still be used for reads. + + The behavior of eager page splitting depends on whether + KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If + disabled, all huge pages in a memslot will be eagerly + split when dirty logging is enabled on that memslot. If + enabled, eager page splitting will be performed during + the KVM_CLEAR_DIRTY ioctl, and only for the pages being + cleared. + + Eager page splitting currently only supports splitting + huge pages mapped by the TDP MMU. + + Default is Y (on). + kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface. Default is false (don't support). - kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit - KVM MMU at runtime. - Default is 0 (off) - kvm.nx_huge_pages= [KVM] Controls the software workaround for the X86_BUG_ITLB_MULTIHIT bug. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9954568c7eab..c3e68c1531f0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3683,15 +3683,17 @@ The fields in each entry are defined as follows: 4.89 KVM_S390_MEM_OP -------------------- -:Capability: KVM_CAP_S390_MEM_OP +:Capability: KVM_CAP_S390_MEM_OP, KVM_CAP_S390_PROTECTED, KVM_CAP_S390_MEM_OP_EXTENSION :Architectures: s390 -:Type: vcpu ioctl +:Type: vm ioctl, vcpu ioctl :Parameters: struct kvm_s390_mem_op (in) :Returns: = 0 on success, < 0 on generic error (e.g. -EFAULT or -ENOMEM), > 0 if an exception occurred while walking the page tables -Read or write data from/to the logical (virtual) memory of a VCPU. +Read or write data from/to the VM's memory. +The KVM_CAP_S390_MEM_OP_EXTENSION capability specifies what functionality is +supported. Parameters are specified via the following structure:: @@ -3701,33 +3703,99 @@ Parameters are specified via the following structure:: __u32 size; /* amount of bytes */ __u32 op; /* type of operation */ __u64 buf; /* buffer in userspace */ - __u8 ar; /* the access register number */ - __u8 reserved[31]; /* should be set to 0 */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; }; -The type of operation is specified in the "op" field. It is either -KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or -KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The -KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check -whether the corresponding memory access would create an access exception -(without touching the data in the memory at the destination). In case an -access exception occurred while walking the MMU tables of the guest, the -ioctl returns a positive error number to indicate the type of exception. -This exception is also raised directly at the corresponding VCPU if the -flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. - The start address of the memory region has to be specified in the "gaddr" field, and the length of the region in the "size" field (which must not be 0). The maximum value for "size" can be obtained by checking the KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the userspace application where the read data should be written to for -KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is -stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY -is specified, "buf" is unused and can be NULL. "ar" designates the access -register number to be used; the valid range is 0..15. +a read access, or where the data that should be written is stored for +a write access. The "reserved" field is meant for future extensions. +Reserved and unused values are ignored. Future extension that add members must +introduce new flags. + +The type of operation is specified in the "op" field. Flags modifying +their behavior can be set in the "flags" field. Undefined flag bits must +be set to 0. + +Possible operations are: + * ``KVM_S390_MEMOP_LOGICAL_READ`` + * ``KVM_S390_MEMOP_LOGICAL_WRITE`` + * ``KVM_S390_MEMOP_ABSOLUTE_READ`` + * ``KVM_S390_MEMOP_ABSOLUTE_WRITE`` + * ``KVM_S390_MEMOP_SIDA_READ`` + * ``KVM_S390_MEMOP_SIDA_WRITE`` + +Logical read/write: +^^^^^^^^^^^^^^^^^^^ + +Access logical memory, i.e. translate the given guest address to an absolute +address given the state of the VCPU and use the absolute address as target of +the access. "ar" designates the access register number to be used; the valid +range is 0..15. +Logical accesses are permitted for the VCPU ioctl only. +Logical accesses are permitted for non-protected guests only. + +Supported flags: + * ``KVM_S390_MEMOP_F_CHECK_ONLY`` + * ``KVM_S390_MEMOP_F_INJECT_EXCEPTION`` + * ``KVM_S390_MEMOP_F_SKEY_PROTECTION`` + +The KVM_S390_MEMOP_F_CHECK_ONLY flag can be set to check whether the +corresponding memory access would cause an access exception; however, +no actual access to the data in memory at the destination is performed. +In this case, "buf" is unused and can be NULL. + +In case an access exception occurred during the access (or would occur +in case of KVM_S390_MEMOP_F_CHECK_ONLY), the ioctl returns a positive +error number indicating the type of exception. This exception is also +raised directly at the corresponding VCPU if the flag +KVM_S390_MEMOP_F_INJECT_EXCEPTION is set. + +If the KVM_S390_MEMOP_F_SKEY_PROTECTION flag is set, storage key +protection is also in effect and may cause exceptions if accesses are +prohibited given the access key designated by "key"; the valid range is 0..15. +KVM_S390_MEMOP_F_SKEY_PROTECTION is available if KVM_CAP_S390_MEM_OP_EXTENSION +is > 0. + +Absolute read/write: +^^^^^^^^^^^^^^^^^^^^ + +Access absolute memory. This operation is intended to be used with the +KVM_S390_MEMOP_F_SKEY_PROTECTION flag, to allow accessing memory and performing +the checks required for storage key protection as one operation (as opposed to +user space getting the storage keys, performing the checks, and accessing +memory thereafter, which could lead to a delay between check and access). +Absolute accesses are permitted for the VM ioctl if KVM_CAP_S390_MEM_OP_EXTENSION +is > 0. +Currently absolute accesses are not permitted for VCPU ioctls. +Absolute accesses are permitted for non-protected guests only. + +Supported flags: + * ``KVM_S390_MEMOP_F_CHECK_ONLY`` + * ``KVM_S390_MEMOP_F_SKEY_PROTECTION`` + +The semantics of the flags are as for logical accesses. + +SIDA read/write: +^^^^^^^^^^^^^^^^ + +Access the secure instruction data area which contains memory operands necessary +for instruction emulation for protected guests. +SIDA accesses are available if the KVM_CAP_S390_PROTECTED capability is available. +SIDA accesses are permitted for the VCPU ioctl only. +SIDA accesses are permitted for protected guests only. -The "reserved" field is meant for future extensions. It is not used by -KVM with the currently defined set of flags. +No flags are supported. 4.90 KVM_S390_GET_SKEYS ----------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 69a2935daf6c..fe332650d032 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10547,8 +10547,8 @@ F: arch/riscv/kvm/ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger <[email protected]> M: Janosch Frank <[email protected]> +M: Claudio Imbrenda <[email protected]> R: David Hildenbrand <[email protected]> -R: Claudio Imbrenda <[email protected]> S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -13571,7 +13571,7 @@ F: tools/testing/selftests/nci/ NFS, SUNRPC, AND LOCKD CLIENTS M: Trond Myklebust <[email protected]> -M: Anna Schumaker <[email protected]> +M: Anna Schumaker <[email protected]> S: Maintained W: http://client.linux-nfs.org diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 04dc65f8901d..c800199a376b 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -12,6 +12,8 @@ #define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10) #define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35) +#define CR0_FETCH_PROTECTION_OVERRIDE BIT(63 - 38) +#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(63 - 39) #define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49) #define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50) #define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index d98d17a36c7b..cfc4d6fb2385 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -20,6 +20,8 @@ #define PAGE_SIZE _PAGE_SIZE #define PAGE_MASK _PAGE_MASK #define PAGE_DEFAULT_ACC 0 +/* storage-protection override */ +#define PAGE_SPO_ACC 9 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) #define HPAGE_SHIFT 20 diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index d74e26b48604..ba1bcb91af95 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -44,6 +44,28 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); #define INLINE_COPY_TO_USER #endif +unsigned long __must_check +_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) +{ + if (likely(check_copy_size(to, n, false))) + n = _copy_from_user_key(to, from, n, key); + return n; +} + +unsigned long __must_check +_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) +{ + if (likely(check_copy_size(from, n, true))) + n = _copy_to_user_key(to, from, n, key); + return n; +} + int __put_user_bad(void) __attribute__((noreturn)); int __get_user_bad(void) __attribute__((noreturn)); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4460808c3b9a..d53a183c2005 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -10,6 +10,7 @@ #include <linux/mm_types.h> #include <linux/err.h> #include <linux/pgtable.h> +#include <linux/bitfield.h> #include <asm/gmap.h> #include "kvm-s390.h" @@ -794,6 +795,108 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } +static int vm_check_access_key(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) +{ + u8 storage_key, access_control; + bool fetch_protected; + unsigned long hva; + int r; + + if (access_key == 0) + return 0; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + if (access_control == access_key) + return 0; + fetch_protected = storage_key & _PAGE_FP_BIT; + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + return 0; + return PGM_PROTECTION; +} + +static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode, + union asce asce) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long override; + + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* check if fetch protection override enabled */ + override = vcpu->arch.sie_block->gcr[0]; + override &= CR0_FETCH_PROTECTION_OVERRIDE; + /* not applicable if subject to DAT && private space */ + override = override && !(psw_bits(*psw).dat && asce.p); + return override; + } + return false; +} + +static bool fetch_prot_override_applies(unsigned long ga, unsigned int len) +{ + return ga < 2048 && ga + len <= 2048; +} + +static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu) +{ + /* check if storage protection override enabled */ + return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE; +} + +static bool storage_prot_override_applies(u8 access_control) +{ + /* matches special storage protection override key (9) -> allow */ + return access_control == PAGE_SPO_ACC; +} + +static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) +{ + u8 storage_key, access_control; + unsigned long hva; + int r; + + /* access key 0 matches any storage key -> allow */ + if (access_key == 0) + return 0; + /* + * caller needs to ensure that gfn is accessible, so we can + * assume that this cannot fail + */ + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + /* access key matches storage key -> allow */ + if (access_control == access_key) + return 0; + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* it is a fetch and fetch protection is off -> allow */ + if (!(storage_key & _PAGE_FP_BIT)) + return 0; + if (fetch_prot_override_applicable(vcpu, mode, asce) && + fetch_prot_override_applies(ga, len)) + return 0; + } + if (storage_prot_override_applicable(vcpu) && + storage_prot_override_applies(access_control)) + return 0; + return PGM_PROTECTION; +} + /** * guest_range_to_gpas() - Calculate guest physical addresses of page fragments * covering a logical range @@ -804,6 +907,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, * @len: length of range in bytes * @asce: address-space-control element to use for translation * @mode: access mode + * @access_key: access key to mach the range's storage keys against * * Translate a logical range to a series of guest absolute addresses, * such that the concatenation of page fragments starting at each gpa make up @@ -830,7 +934,8 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, */ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, unsigned long *gpas, unsigned long len, - const union asce asce, enum gacc_mode mode) + const union asce asce, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; unsigned int offset = offset_in_page(ga); @@ -857,6 +962,10 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); + rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, + fragment_len); + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); if (gpas) *gpas++ = gpa; offset = 0; @@ -880,16 +989,74 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, return rc; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode) +static int +access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) +{ + struct kvm_memory_slot *slot; + bool writable; + gfn_t gfn; + hva_t hva; + int rc; + + gfn = gpa >> PAGE_SHIFT; + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a ro memslot, even tho that can't occur (they're unsupported). + * Don't try to actually handle that case. + */ + if (!writable && mode == GACC_STORE) + return -EOPNOTSUPP; + hva += offset_in_page(gpa); + if (mode == GACC_STORE) + rc = copy_to_user_key((void __user *)hva, data, len, access_key); + else + rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + if (rc) + return PGM_PROTECTION; + if (mode == GACC_STORE) + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key) +{ + int offset = offset_in_page(gpa); + int fragment_len; + int rc; + + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + if (rc) + return rc; + offset = 0; + len -= fragment_len; + data += fragment_len; + gpa += fragment_len; + } + return 0; +} + +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; unsigned long nr_pages, idx; unsigned long gpa_array[2]; unsigned int fragment_len; unsigned long *gpas; + enum prot_type prot; int need_ipte_lock; union asce asce; + bool try_storage_prot_override; + bool try_fetch_prot_override; int rc; if (!len) @@ -904,16 +1071,47 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long))); if (!gpas) return -ENOMEM; + try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce); + try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); - rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode); - for (idx = 0; idx < nr_pages && !rc; idx++) { + /* + * Since we do the access further down ultimately via a move instruction + * that does key checking and returns an error in case of a protection + * violation, we don't need to do the check during address translation. + * Skip it by passing access key 0, which matches any storage key, + * obviating the need for any further checks. As a result the check is + * handled entirely in hardware on access, we only need to take care to + * forego key protection checking if fetch protection override applies or + * retry with the special key 9 in case of storage protection override. + */ + rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0); + if (rc) + goto out_unlock; + for (idx = 0; idx < nr_pages; idx++) { fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); - rc = access_guest_page(vcpu->kvm, mode, gpas[idx], data, fragment_len); + if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { + rc = access_guest_page(vcpu->kvm, mode, gpas[idx], + data, fragment_len); + } else { + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); + } + if (rc == PGM_PROTECTION && try_storage_prot_override) + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; + if (rc) + break; len -= fragment_len; data += fragment_len; + ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); } + if (rc > 0) + rc = trans_exc(vcpu, rc, ga, ar, mode, prot); +out_unlock: if (need_ipte_lock) ipte_unlock(vcpu); if (nr_pages > ARRAY_SIZE(gpa_array)) @@ -940,12 +1138,13 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } /** - * guest_translate_address - translate guest logical into guest absolute address + * guest_translate_address_with_key - translate guest logical into guest absolute address * @vcpu: virtual cpu * @gva: Guest virtual address * @ar: Access register * @gpa: Guest physical address * @mode: Translation access mode + * @access_key: access key to mach the storage key with * * Parameter semantics are the same as the ones from guest_translate. * The memory contents at the guest address are not changed. @@ -953,8 +1152,9 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this. */ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long *gpa, enum gacc_mode mode) +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key) { union asce asce; int rc; @@ -963,7 +1163,8 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode); + return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode, + access_key); } /** @@ -973,9 +1174,10 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, * @ar: Access register * @length: Length of test range * @mode: Translation access mode + * @access_key: access key to mach the storage keys with */ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode) + unsigned long length, enum gacc_mode mode, u8 access_key) { union asce asce; int rc = 0; @@ -984,13 +1186,37 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, if (rc) return rc; ipte_lock(vcpu); - rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode); + rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, + access_key); ipte_unlock(vcpu); return rc; } /** + * check_gpa_range - test a range of guest physical addresses for accessibility + * @kvm: virtual machine instance + * @gpa: guest physical address + * @length: length of test range + * @mode: access mode to test, relevant for storage keys + * @access_key: access key to mach the storage keys with + */ +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key) +{ + unsigned int fragment_len; + int rc = 0; + + while (length && !rc) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); + rc = vm_check_access_key(kvm, access_key, mode, gpa); + length -= fragment_len; + gpa += fragment_len; + } + return rc; +} + +/** * kvm_s390_check_low_addr_prot_real - check for low-address protection * @vcpu: virtual cpu * @gra: Guest real address diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 7c72a5e3449f..1124ff282012 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -186,24 +186,34 @@ enum gacc_mode { GACC_IFETCH, }; -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - u8 ar, unsigned long *gpa, enum gacc_mode mode); +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key); + int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode); + unsigned long length, enum gacc_mode mode, u8 access_key); + +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key); + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode); +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); /** - * write_guest - copy data from kernel space to guest space + * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu * @ga: guest address * @ar: access register * @data: source address in kernel space * @len: number of bytes to copy + * @access_key: access key the storage key needs to match * * Copy @len bytes from @data (kernel space) to @ga (guest address). * In order to copy data to guest space the PSW of the vcpu is inspected: @@ -214,8 +224,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * The addressing mode of the PSW is also inspected, so that address wrap * around is taken into account for 24-, 31- and 64-bit addressing mode, * if the to be copied data crosses page boundaries in guest address space. - * In addition also low address and DAT protection are inspected before - * copying any data (key protection is currently not implemented). + * In addition low address, DAT and key protection checks are performed before + * copying any data. * * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. * In case of an access exception (e.g. protection exception) pgm will contain @@ -243,10 +253,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. */ static inline __must_check +int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE, + access_key); +} + +/** + * write_guest - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * + * The behaviour of write_guest is identical to write_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_STORE); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return write_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_with_key - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest_with_key is identical to write_guest_with_key, + * except that data will be copied from guest space to kernel space. + */ +static inline __must_check +int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH, + access_key); } /** @@ -259,14 +312,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * * Copy @len bytes from @ga (guest address) to @data (kernel space). * - * The behaviour of read_guest is identical to write_guest, except that - * data will be copied from guest space to kernel space. + * The behaviour of read_guest is identical to read_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. */ static inline __must_check int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return read_guest_with_key(vcpu, ga, ar, data, len, access_key); } /** @@ -287,7 +342,10 @@ static inline __must_check int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, unsigned long len) { - return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH, + access_key); } /** diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index d07ff646d844..8bd42a20d924 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -331,18 +331,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - /* Make sure that the source is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2], - reg2, &srcaddr, GACC_FETCH); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2], + reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); if (rc != 0) return rc; - /* Make sure that the destination is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1], - reg1, &dstaddr, GACC_STORE); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1], + reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 577f1ead6a51..e056ad86ccd2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -564,6 +564,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: + case KVM_CAP_S390_MEM_OP_EXTENSION: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -2359,6 +2360,83 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) return r; } +static bool access_key_invalid(u8 access_key) +{ + return access_key > 0xf; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + u64 supported_flags; + void *tmpbuf = NULL; + int r, srcu_idx; + + supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION + | KVM_S390_MEMOP_F_CHECK_ONLY; + if (mop->flags & ~supported_flags || !mop->size) + return -EINVAL; + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + if (kvm_s390_pv_is_protected(kvm)) + return -EINVAL; + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (access_key_invalid(mop->key)) + return -EINVAL; + } else { + mop->key = 0; + } + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: { + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key); + } else { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r == 0) { + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } + } + break; + } + case KVM_S390_MEMOP_ABSOLUTE_WRITE: { + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key); + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + break; + } + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); + } + break; + } + default: + r = -EINVAL; + } + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + vfree(tmpbuf); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2483,6 +2561,15 @@ long kvm_arch_vm_ioctl(struct file *filp, } break; } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } default: r = -ENOTTY; } @@ -4655,8 +4742,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, - struct kvm_s390_mem_op *mop) +static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; int r = 0; @@ -4667,6 +4754,8 @@ static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, return -EINVAL; if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: @@ -4683,24 +4772,29 @@ static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, } return r; } -static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, - struct kvm_s390_mem_op *mop) + +static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void *tmpbuf = NULL; int r = 0; const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY; + | KVM_S390_MEMOP_F_CHECK_ONLY + | KVM_S390_MEMOP_F_SKEY_PROTECTION; if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) return -EINVAL; - if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; - + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (access_key_invalid(mop->key)) + return -EINVAL; + } else { + mop->key = 0; + } if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) @@ -4710,11 +4804,12 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_FETCH); + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + GACC_FETCH, mop->key); break; } - r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); if (r == 0) { if (copy_to_user(uaddr, tmpbuf, mop->size)) r = -EFAULT; @@ -4722,15 +4817,16 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, break; case KVM_S390_MEMOP_LOGICAL_WRITE: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_STORE); + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + GACC_STORE, mop->key); break; } if (copy_from_user(tmpbuf, uaddr, mop->size)) { r = -EFAULT; break; } - r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); break; } @@ -4741,8 +4837,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, return r; } -static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu, - struct kvm_s390_mem_op *mop) +static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) { int r, srcu_idx; @@ -4751,12 +4847,12 @@ static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu, switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: case KVM_S390_MEMOP_LOGICAL_WRITE: - r = kvm_s390_guest_mem_op(vcpu, mop); + r = kvm_s390_vcpu_mem_op(vcpu, mop); break; case KVM_S390_MEMOP_SIDA_READ: case KVM_S390_MEMOP_SIDA_WRITE: /* we are locked against sida going away by the vcpu->mutex */ - r = kvm_s390_guest_sida_op(vcpu, mop); + r = kvm_s390_vcpu_sida_op(vcpu, mop); break; default: r = -EINVAL; @@ -4919,7 +5015,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_memsida_op(vcpu, &mem_op); + r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 417154b314a6..30b24c42ef99 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1443,10 +1443,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) static int handle_tprot(struct kvm_vcpu *vcpu) { - u64 address1, address2; - unsigned long hva, gpa; - int ret = 0, cc = 0; + u64 address, operand2; + unsigned long gpa; + u8 access_key; bool writable; + int ret, cc; u8 ar; vcpu->stat.instruction_tprot++; @@ -1454,43 +1455,46 @@ static int handle_tprot(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL); + kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL); + access_key = (operand2 & 0xf0) >> 4; - /* we only handle the Linux memory detection case: - * access key == 0 - * everything else goes to userspace. */ - if (address2 & 0xf0) - return -EOPNOTSUPP; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) ipte_lock(vcpu); - ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE); - if (ret == PGM_PROTECTION) { + + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_STORE, access_key); + if (ret == 0) { + gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); + } else if (ret == PGM_PROTECTION) { + writable = false; /* Write protected? Try again with read-only... */ - cc = 1; - ret = guest_translate_address(vcpu, address1, ar, &gpa, - GACC_FETCH); + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_FETCH, access_key); } - if (ret) { - if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) { - ret = kvm_s390_inject_program_int(vcpu, ret); - } else if (ret > 0) { - /* Translation not available */ - kvm_s390_set_psw_cc(vcpu, 3); + if (ret >= 0) { + cc = -1; + + /* Fetching permitted; storing permitted */ + if (ret == 0 && writable) + cc = 0; + /* Fetching permitted; storing not permitted */ + else if (ret == 0 && !writable) + cc = 1; + /* Fetching not permitted; storing not permitted */ + else if (ret == PGM_PROTECTION) + cc = 2; + /* Translation not available */ + else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC) + cc = 3; + + if (cc != -1) { + kvm_s390_set_psw_cc(vcpu, cc); ret = 0; + } else { + ret = kvm_s390_inject_program_int(vcpu, ret); } - goto out_unlock; } - hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); - if (kvm_is_error_hva(hva)) { - ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } else { - if (!writable) - cc = 1; /* Write not permitted ==> read-only */ - kvm_s390_set_psw_cc(vcpu, cc); - /* Note: CC2 only occurs for storage keys (not supported yet) */ - } -out_unlock: if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) ipte_unlock(vcpu); return ret; diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 8a5d21461889..b709239feb5d 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -59,11 +59,13 @@ static inline int copy_with_mvcos(void) #endif static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, - unsigned long size) + unsigned long size, unsigned long key) { unsigned long tmp1, tmp2; union oac spec = { + .oac2.key = key, .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.k = 1, .oac2.a = 1, }; @@ -94,19 +96,19 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr } static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, - unsigned long size) + unsigned long size, unsigned long key) { unsigned long tmp1, tmp2; tmp1 = -256UL; asm volatile( " sacf 0\n" - "0: mvcp 0(%0,%2),0(%1),%3\n" + "0: mvcp 0(%0,%2),0(%1),%[key]\n" "7: jz 5f\n" "1: algr %0,%3\n" " la %1,256(%1)\n" " la %2,256(%2)\n" - "2: mvcp 0(%0,%2),0(%1),%3\n" + "2: mvcp 0(%0,%2),0(%1),%[key]\n" "8: jnz 1b\n" " j 5f\n" "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ @@ -115,7 +117,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, " slgr %4,%1\n" " clgr %0,%4\n" /* copy crosses next page boundary? */ " jnh 6f\n" - "4: mvcp 0(%4,%2),0(%1),%3\n" + "4: mvcp 0(%4,%2),0(%1),%[key]\n" "9: slgr %0,%4\n" " j 6f\n" "5: slgr %0,%0\n" @@ -123,24 +125,49 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); + : [key] "d" (key << 4) + : "cc", "memory"); return size; } -unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) +static unsigned long raw_copy_from_user_key(void *to, const void __user *from, + unsigned long n, unsigned long key) { if (copy_with_mvcos()) - return copy_from_user_mvcos(to, from, n); - return copy_from_user_mvcp(to, from, n); + return copy_from_user_mvcos(to, from, n, key); + return copy_from_user_mvcp(to, from, n, key); +} + +unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return raw_copy_from_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_from_user); +unsigned long _copy_from_user_key(void *to, const void __user *from, + unsigned long n, unsigned long key) +{ + unsigned long res = n; + + might_fault(); + if (!should_fail_usercopy()) { + instrument_copy_from_user(to, from, n); + res = raw_copy_from_user_key(to, from, n, key); + } + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +EXPORT_SYMBOL(_copy_from_user_key); + static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, - unsigned long size) + unsigned long size, unsigned long key) { unsigned long tmp1, tmp2; union oac spec = { + .oac1.key = key, .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.k = 1, .oac1.a = 1, }; @@ -171,19 +198,19 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, } static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, - unsigned long size) + unsigned long size, unsigned long key) { unsigned long tmp1, tmp2; tmp1 = -256UL; asm volatile( " sacf 0\n" - "0: mvcs 0(%0,%1),0(%2),%3\n" + "0: mvcs 0(%0,%1),0(%2),%[key]\n" "7: jz 5f\n" "1: algr %0,%3\n" " la %1,256(%1)\n" " la %2,256(%2)\n" - "2: mvcs 0(%0,%1),0(%2),%3\n" + "2: mvcs 0(%0,%1),0(%2),%[key]\n" "8: jnz 1b\n" " j 5f\n" "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ @@ -192,7 +219,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, " slgr %4,%1\n" " clgr %0,%4\n" /* copy crosses next page boundary? */ " jnh 6f\n" - "4: mvcs 0(%4,%1),0(%2),%3\n" + "4: mvcs 0(%4,%1),0(%2),%[key]\n" "9: slgr %0,%4\n" " j 6f\n" "5: slgr %0,%0\n" @@ -200,18 +227,36 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); + : [key] "d" (key << 4) + : "cc", "memory"); return size; } -unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) +static unsigned long raw_copy_to_user_key(void __user *to, const void *from, + unsigned long n, unsigned long key) { if (copy_with_mvcos()) - return copy_to_user_mvcos(to, from, n); - return copy_to_user_mvcs(to, from, n); + return copy_to_user_mvcos(to, from, n, key); + return copy_to_user_mvcs(to, from, n, key); +} + +unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return raw_copy_to_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_to_user); +unsigned long _copy_to_user_key(void __user *to, const void *from, + unsigned long n, unsigned long key) +{ + might_fault(); + if (should_fail_usercopy()) + return n; + instrument_copy_to_user(to, from, n); + return raw_copy_to_user_key(to, from, n, key); +} +EXPORT_SYMBOL(_copy_to_user_key); + static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index d39e0de06be2..29affccb353c 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -1,29 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL) BUILD_BUG_ON(1) #endif /* - * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate - * "static_call()"s. They are also intended for use when defining - * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those - * functions that follow the [svm|vmx]_func_name convention. - * KVM_X86_OP_NULL() can leave a NULL definition for the - * case where there is no definition or a function name that - * doesn't match the typical naming convention is supplied. + * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * to make a definition optional, but in this case the default will + * be __static_call_return0. */ -KVM_X86_OP_NULL(hardware_enable) -KVM_X86_OP_NULL(hardware_disable) -KVM_X86_OP_NULL(hardware_unsetup) -KVM_X86_OP_NULL(cpu_has_accelerated_tpr) +KVM_X86_OP(hardware_enable) +KVM_X86_OP(hardware_disable) +KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) -KVM_X86_OP_NULL(vm_destroy) +KVM_X86_OP_OPTIONAL(vm_destroy) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) -KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(prepare_switch_to_guest) KVM_X86_OP(vcpu_load) KVM_X86_OP(vcpu_put) KVM_X86_OP(update_exception_bitmap) @@ -33,9 +34,9 @@ KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) -KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(get_cs_db_l_bits) KVM_X86_OP(set_cr0) -KVM_X86_OP_NULL(post_set_cr3) +KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) @@ -49,22 +50,22 @@ KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) -KVM_X86_OP(tlb_flush_all) -KVM_X86_OP(tlb_flush_current) -KVM_X86_OP_NULL(tlb_remote_flush) -KVM_X86_OP_NULL(tlb_remote_flush_with_range) -KVM_X86_OP(tlb_flush_gva) -KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(flush_tlb_all) +KVM_X86_OP(flush_tlb_current) +KVM_X86_OP_OPTIONAL(tlb_remote_flush) +KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +KVM_X86_OP(flush_tlb_gva) +KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) -KVM_X86_OP(run) -KVM_X86_OP_NULL(handle_exit) -KVM_X86_OP_NULL(skip_emulated_instruction) -KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(vcpu_run) +KVM_X86_OP(handle_exit) +KVM_X86_OP(skip_emulated_instruction) +KVM_X86_OP_OPTIONAL(update_emulated_instruction) KVM_X86_OP(set_interrupt_shadow) KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) -KVM_X86_OP(set_irq) -KVM_X86_OP(set_nmi) +KVM_X86_OP(inject_irq) +KVM_X86_OP(inject_nmi) KVM_X86_OP(queue_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) @@ -73,22 +74,22 @@ KVM_X86_OP(get_nmi_mask) KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) -KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) -KVM_X86_OP(hwapic_irr_update) -KVM_X86_OP(hwapic_isr_update) -KVM_X86_OP_NULL(guest_apic_has_interrupt) -KVM_X86_OP(load_eoi_exitmap) -KVM_X86_OP(set_virtual_apic_mode) -KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP_OPTIONAL(hwapic_irr_update) +KVM_X86_OP_OPTIONAL(hwapic_isr_update) +KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL(load_eoi_exitmap) +KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) +KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_interrupt) -KVM_X86_OP_NULL(sync_pir_to_irr) -KVM_X86_OP(set_tss_addr) -KVM_X86_OP(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL(sync_pir_to_irr) +KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) +KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) -KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) @@ -96,32 +97,36 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) -KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(vcpu_blocking) -KVM_X86_OP_NULL(vcpu_unblocking) -KVM_X86_OP_NULL(update_pi_irte) -KVM_X86_OP_NULL(start_assignment) -KVM_X86_OP_NULL(apicv_post_state_restore) -KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) -KVM_X86_OP_NULL(set_hv_timer) -KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) +KVM_X86_OP_OPTIONAL(vcpu_blocking) +KVM_X86_OP_OPTIONAL(vcpu_unblocking) +KVM_X86_OP_OPTIONAL(pi_update_irte) +KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_post_state_restore) +KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(set_hv_timer) +KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) -KVM_X86_OP_NULL(mem_enc_op) -KVM_X86_OP_NULL(mem_enc_reg_region) -KVM_X86_OP_NULL(mem_enc_unreg_region) +KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(mem_enc_register_region) +KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) +KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) +KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_NULL(enable_direct_tlbflush) -KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) -KVM_X86_OP_NULL(complete_emulated_msr) +KVM_X86_OP(complete_emulated_msr) +KVM_X86_OP(vcpu_deliver_sipi_vector) #undef KVM_X86_OP -#undef KVM_X86_OP_NULL +#undef KVM_X86_OP_OPTIONAL +#undef KVM_X86_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6dcccb304775..884c926e4359 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1128,10 +1128,6 @@ struct kvm_arch { struct kvm_hv hyperv; struct kvm_xen xen; - #ifdef CONFIG_KVM_MMU_AUDIT - int audit_point; - #endif - bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; @@ -1318,7 +1314,6 @@ struct kvm_x86_ops { int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); - bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1331,7 +1326,7 @@ struct kvm_x86_ops { void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); @@ -1361,8 +1356,8 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); - void (*tlb_flush_all)(struct kvm_vcpu *vcpu); - void (*tlb_flush_current)(struct kvm_vcpu *vcpu); + void (*flush_tlb_all)(struct kvm_vcpu *vcpu); + void (*flush_tlb_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1373,16 +1368,16 @@ struct kvm_x86_ops { * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ - void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. */ - void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1391,8 +1386,8 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); @@ -1459,9 +1454,9 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); - void (*start_assignment)(struct kvm *kvm); + void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1476,9 +1471,9 @@ struct kvm_x86_ops { int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); - int (*mem_enc_op)(struct kvm *kvm, void __user *argp); - int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); - int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); @@ -1541,15 +1536,22 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> static inline void kvm_ops_static_call_update(void) { -#define KVM_X86_OP(func) \ +#define __KVM_X86_OP(func) \ static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func ? : \ + (void *) __static_call_return0); #include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP } #define __KVM_HAVE_ARCH_VM_ALLOC @@ -1587,6 +1589,13 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level); +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int target_level); +void kvm_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + u64 start, u64 end, + int target_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, @@ -1724,7 +1733,6 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -1878,7 +1886,7 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr) return kvm_find_user_return_msr(msr) >= 0; } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); +u64 kvm_scale_tsc(u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2b1548da00eb..e3cbd7706136 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -126,13 +126,6 @@ config KVM_XEN If in doubt, say "N". -config KVM_MMU_AUDIT - bool "Audit KVM MMU" - depends on KVM && TRACEPOINTS - help - This option adds a R/W kVM module parameter 'mmu_audit', which allows - auditing of KVM MMU events at runtime. - config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 494d4d351859..ff756cdc31ce 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -712,9 +712,17 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, entry = &array->entries[array->nent++]; + memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; - entry->flags = 0; + switch (function & 0xC0000000) { + case 0x40000000: + /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ + return entry; + + default: + break; + } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5719d8cfdbd9..b7990defca9c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2600,8 +2600,7 @@ emulate_shutdown: } static void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct desc_struct *cs, struct desc_struct *ss) +setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) { cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ @@ -2690,7 +2689,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) if (!(efer & EFER_SCE)) return emulate_ud(ctxt); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2758,7 +2757,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; @@ -2795,7 +2794,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); if ((ctxt->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -3013,8 +3012,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int task_switch_16(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_16 tss_seg; @@ -3152,8 +3150,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return ret; } -static int task_switch_32(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_32 tss_seg; @@ -3261,10 +3258,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, old_tss_sel = 0xffff; if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); + ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); else - ret = task_switch_16(ctxt, tss_selector, old_tss_sel, + ret = task_switch_16(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6e38a7d22e97..653e08c993c4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; + if (!enable_apicv) + return; + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) @@ -1710,32 +1713,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline unsigned long *sparse_set_to_vcpu_mask( - struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, - u64 *vp_bitmap, unsigned long *vcpu_bitmap) +static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, + u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); + bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; + u64 *bitmap; + + BUILD_BUG_ON(sizeof(vp_bitmap) > + sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); - memset(vp_bitmap, 0, - KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + /* + * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else + * fill a temporary buffer and manually test each vCPU's VP index. + */ + if (likely(!has_mismatch)) + bitmap = (u64 *)vcpu_mask; + else + bitmap = vp_bitmap; + + /* + * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask + * having a '1' for each bank that exists in sparse_banks. Sets must + * be in ascending order, i.e. bank0..bankN. + */ + memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) - vp_bitmap[bank] = sparse_banks[sbank++]; + bitmap[bank] = sparse_banks[sbank++]; - if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { - /* for all vcpus vp_index == vcpu_idx */ - return (unsigned long *)vp_bitmap; - } + if (likely(!has_mismatch)) + return; - bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) - __set_bit(i, vcpu_bitmap); + __set_bit(i, vcpu_mask); } - return vcpu_bitmap; } struct kvm_hv_hcall { @@ -1743,6 +1761,7 @@ struct kvm_hv_hcall { u64 ingpa; u64 outgpa; u16 code; + u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; @@ -1750,21 +1769,40 @@ struct kvm_hv_hcall { sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks, gpa_t offset) +{ + u16 var_cnt; + + if (hc->var_cnt > 64) + return -EINVAL; + + /* Ignore banks that cannot possibly contain a legal VP index. */ + var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); + + return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, + var_cnt * sizeof(*sparse_banks)); +} + static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { int i; - gpa_t gpa; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; bool all_cpus; + /* + * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the + * valid mask is a u64. Fail the build if KVM's max allowed number of + * vCPUs (>4096) would exceed this limit, KVM will additional changes + * for Hyper-V support to avoid setting the guest up to fail. + */ + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + if (!ex) { if (hc->fast) { flush.address_space = hc->ingpa; @@ -1812,30 +1850,32 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64); + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + if (all_cpus) + goto do_flush; - if (!sparse_banks_len && !all_cpus) + if (!hc->var_cnt) goto ret_success; - if (!all_cpus) { - if (hc->fast) { - if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) - return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < sparse_banks_len; i += 2) { - sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); - sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); - } - } else { - gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents); - if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks, - sparse_banks_len * - sizeof(sparse_banks[0])))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + if (hc->var_cnt > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + for (i = 0; i < hc->var_cnt; i += 2) { + sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); + sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); } + goto do_flush; } + + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; } +do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. @@ -1843,11 +1883,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool if (all_cpus) { kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); } else { - vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - vcpu_mask); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); } ret_success: @@ -1880,12 +1918,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); unsigned long valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1918,22 +1953,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); - all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (all_cpus) goto check_and_send_ipi; - if (!sparse_banks_len) + if (!hc->var_cnt) goto ret_success; - if (kvm_read_guest(kvm, - hc->ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } @@ -1941,11 +1974,13 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + if (all_cpus) { + kvm_send_ipi_to_many(kvm, vector, NULL); + } else { + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + } ret_success: return HV_STATUS_SUCCESS; @@ -2017,11 +2052,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) return ret; } -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; -} - static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; @@ -2191,19 +2221,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } hc.code = hc.param & 0xffff; + hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, - hc.ingpa, hc.outgpa); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, + hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } + if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + goto hypercall_complete; + } + if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & @@ -2217,14 +2253,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2234,7 +2270,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) { + if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2247,14 +2283,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(!hc.rep_cnt || hc.rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2275,7 +2311,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_SEND_IPI: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2417,10 +2453,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); - /* Skip NESTED_FEATURES if eVMCS is not supported */ - if (!evmcs_ver) - --nent; - if (cpuid->nent < nent) return -E2BIG; @@ -2520,8 +2552,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; - if (evmcs_ver) - ent->eax |= HV_X64_NESTED_MSR_BITMAP; + ent->eax |= HV_X64_NESTED_MSR_BITMAP; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index ed1c4e546d04..e19c00ee9ab3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 814064d06016..be99dc86293d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr) return ret; } -static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +static void elcr_ioport_write(void *opaque, u32 val) { struct kvm_kpic_state *s = opaque; s->elcr = val & s->elcr_mask; } -static u32 elcr_ioport_read(void *opaque, u32 addr1) +static u32 elcr_ioport_read(void *opaque) { struct kvm_kpic_state *s = opaque; return s->elcr; @@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - elcr_ioport_write(&s->pics[addr & 1], addr, data); + elcr_ioport_write(&s->pics[addr & 1], data); pic_unlock(s); break; default: @@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - *data = elcr_ioport_read(&s->pics[addr & 1], addr); + *data = elcr_ioport_read(&s->pics[addr & 1]); pic_unlock(s); break; default: diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index decfa36b7891..765943d7cfa5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, int trigger_mode, int pin); -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic) { unsigned long result = 0; @@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, break; case IOAPIC_REG_WINDOW: - result = ioapic_read_indirect(ioapic, addr, len); + result = ioapic_read_indirect(ioapic); break; default: diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index b469f45e3fe4..ee4f696a0782 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); + +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ + struct kvm_arch *kvm_arch = &vcpu->kvm->arch; + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_arch->hv_root_tdp_lock); + vcpu->arch.hv_root_tdp = root_tdp; + if (root_tdp != kvm_arch->hv_root_tdp) + kvm_arch->hv_root_tdp = INVALID_PAGE; + spin_unlock(&kvm_arch->hv_root_tdp_lock); + } +} +EXPORT_SYMBOL_GPL(hv_track_root_tdp); diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 1c67abf2eba9..287e98ef9df3 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -10,19 +10,7 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range); int hv_remote_flush_tlb(struct kvm *kvm); - -static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) -{ - struct kvm_arch *kvm_arch = &vcpu->kvm->arch; - - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { - spin_lock(&kvm_arch->hv_root_tdp_lock); - vcpu->arch.hv_root_tdp = root_tdp; - if (root_tdp != kvm_arch->hv_root_tdp) - kvm_arch->hv_root_tdp = INVALID_PAGE; - spin_unlock(&kvm_arch->hv_root_tdp_lock); - } -} +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9322e6340a74..47f8606559a9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { - return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && + (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) @@ -491,8 +492,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -522,7 +522,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, vec); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -570,8 +570,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -2287,7 +2286,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - static_call(kvm_x86_set_virtual_apic_mode)(vcpu); + static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2373,9 +2372,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2638,11 +2637,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -2933,7 +2930,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); + static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9fbb2c8bbe2..51faa2c76ca5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,44 +203,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Currently, we have two sorts of write-protection, a) the first one - * write-protects guest page to sync the guest modification, b) another one is - * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences - * between these two sorts are: - * 1) the first case clears MMU-writable bit. - * 2) the first case requires flushing tlb immediately avoiding corrupting - * shadow page table between all vcpus so it should be in the protection of - * mmu-lock. And the another case does not need to flush tlb until returning - * the dirty bitmap to userspace since it only write-protects the page - * logged in the bitmap, that means the page in the dirty bitmap is not - * missed, so it can flush tlb out of mmu-lock. - * - * So, there is the problem: the first case can meet the corrupted tlb caused - * by another case which write-protects pages but without flush tlb - * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose MMU-writable bit - * is set, it works since another case never touches MMU-writable bit. - * - * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with MMU-writable becomes - * readonly, if that happens, we need to flush tlb. Fortunately, - * mmu_spte_update() has already handled it perfectly. - * - * The rules to use MMU-writable and PT_WRITABLE_MASK: - * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check MMU-writable, this is the most - * case, otherwise - * - if we fix page fault on the spte or do write-protection by dirty logging, - * check PT_WRITABLE_MASK. - * - * TODO: introduce APIs to split these two cases. - */ -static inline bool is_writable_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -/* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE * access rights (in ACC_* format). diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 593093b52395..0620480b99e0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -enum { - AUDIT_PRE_PAGE_FAULT, - AUDIT_POST_PAGE_FAULT, - AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE, - AUDIT_PRE_SYNC, - AUDIT_POST_SYNC -}; - #ifdef MMU_DEBUG bool dbg = 0; module_param(dbg, bool, 0644); @@ -529,6 +520,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; WARN_ON(!is_shadow_present_pte(new_spte)); + check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -548,11 +540,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. + * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote + * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only + * spte, even though the writable spte might be cached on a CPU's TLB. * * Returns true if the TLB needs to be flushed */ @@ -646,24 +636,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Restore an acc-track PTE back to a regular PTE */ -static u64 restore_acc_track_spte(u64 spte) -{ - u64 new_spte = spte; - u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) - & SHADOW_ACC_TRACK_SAVED_BITS_MASK; - - WARN_ON_ONCE(spte_ad_enabled(spte)); - WARN_ON_ONCE(!is_access_track_spte(spte)); - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << - SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); - new_spte |= saved_bits; - - return new_spte; -} - /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { @@ -1229,9 +1201,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - bool pt_protect) +static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1311,7 +1282,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_write_protect(kvm, rmap_head, false); + rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1378,6 +1349,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + if (READ_ONCE(eager_page_split)) + kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ @@ -1410,7 +1384,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + write_protected |= rmap_write_protect(rmap_head, true); } } @@ -1421,7 +1395,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, return write_protected; } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; @@ -1921,13 +1895,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } -static void mmu_audit_disable(void) { } -#endif - static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) @@ -2024,7 +1991,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu, sp->gfn); + protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); @@ -2149,7 +2116,7 @@ trace_get_page: hlist_add_head(&sp->hash_link, sp_list); if (!direct) { account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); } trace_kvm_mmu_get_page(sp, true); @@ -2307,7 +2274,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; @@ -2351,7 +2318,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(kvm, sp); + kvm_mmu_unlink_parents(sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -3687,17 +3654,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) return; write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp, true); - - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; @@ -3709,7 +3671,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); } @@ -4474,8 +4435,7 @@ static inline bool boot_cpu_is_amd(void) * possible, however, kvm currently does not do execution-protection. */ static void -reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4506,8 +4466,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * is the shadow page table for intel nested guest. */ static void -reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, @@ -4794,7 +4753,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); - reset_tdp_shadow_zero_bits_mask(vcpu, context); + reset_tdp_shadow_zero_bits_mask(context); } static union kvm_mmu_role @@ -4948,7 +4907,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); - reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -5100,7 +5059,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); out: return r; } @@ -5260,7 +5219,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || @@ -5285,7 +5243,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); - kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5360,7 +5317,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5416,7 +5373,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); ++vcpu->stat.invlpg; @@ -5802,7 +5759,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return __rmap_write_protect(kvm, rmap_head, false); + return rmap_write_protect(rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -5846,12 +5803,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * will clear a separate software-only bit (MMU-writable) and skip the * flush if-and-only-if this bit was already clear. * - * See DEFAULT_SPTE_MMU_WRITEABLE for more details. + * See is_writable_pte() for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +/* Must be called with the mmu_lock held in write-mode. */ +void kvm_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + u64 start, u64 end, + int target_level) +{ + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, + target_level, false); + + /* + * A TLB flush is unnecessary at this point for the same resons as in + * kvm_mmu_slot_try_split_huge_pages(). + */ +} + +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int target_level) +{ + u64 start = memslot->base_gfn; + u64 end = start + memslot->npages; + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + } + + /* + * No TLB flush is necessary here. KVM will flush TLBs after + * write-protecting and/or clearing dirty on the newly split SPTEs to + * ensure that guest writes are reflected in the dirty log before the + * ioctl to enable dirty logging on this memslot completes. Since the + * split SPTEs retain the write and dirty bits of the huge SPTE, it is + * safe for KVM to decide if a TLB flush is necessary based on the split + * SPTEs. + */ +} + static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) @@ -6191,7 +6188,6 @@ void kvm_mmu_module_exit(void) mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); - mmu_audit_disable(); } /* diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c deleted file mode 100644 index 9e7dcf999f08..000000000000 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mmu_audit.c: - * - * Audit code for KVM MMU - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <[email protected]> - * Avi Kivity <[email protected]> - * Marcelo Tosatti <[email protected]> - * Xiao Guangrong <[email protected]> - */ - -#include <linux/ratelimit.h> - -static char const *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - -#define audit_printk(kvm, fmt, args...) \ - printk(KERN_ERR "audit: (%s) error: " \ - fmt, audit_point_name[kvm->arch.audit_point], ##args) - -typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); - -static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - inspect_spte_fn fn, int level) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 *ent = sp->spt; - - fn(vcpu, ent + i, level); - - if (is_shadow_present_pte(ent[i]) && - !is_last_spte(ent[i], level)) { - struct kvm_mmu_page *child; - - child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(vcpu, child, fn, level - 1); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; - - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); - return; - } - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; - - if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, 2); - } - } - - return; -} - -typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); - -static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) - fn(kvm, sp); -} - -static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - kvm_pfn_t pfn; - hpa_t hpa; - - sp = sptep_to_sp(sptep); - - if (sp->unsync) { - if (level != PG_LEVEL_4K) { - audit_printk(vcpu->kvm, "unsync sp: %p " - "level = %d\n", sp, level); - return; - } - } - - if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); - - if (is_error_pfn(pfn)) - return; - - hpa = pfn << PAGE_SHIFT; - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu->root_level, pfn, - hpa, *sptep); -} - -static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - struct kvm_rmap_head *rmap_head; - struct kvm_mmu_page *rev_sp; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t gfn; - - rev_sp = sptep_to_sp(sptep); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); - slot = __gfn_to_memslot(slots, gfn); - if (!slot) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no memslot for gfn %llx\n", gfn); - audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", - (long int)(sptep - rev_sp->spt), rev_sp->gfn); - dump_stack(); - return; - } - - rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); - if (!rmap_head->val) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no rmap for writable spte %llx\n", - *sptep); - dump_stack(); - } -} - -static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) - inspect_spte_has_rmap(vcpu->kvm, sptep); -} - -static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp = sptep_to_sp(sptep); - - if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) - audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " - "root.\n", sp); -} - -static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - int i; - - if (sp->role.level != PG_LEVEL_4K) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(sp->spt[i])) - continue; - - inspect_spte_has_rmap(kvm, sp->spt + i); - } -} - -static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct kvm_rmap_head *rmap_head; - u64 *sptep; - struct rmap_iterator iter; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - - if (sp->role.direct || sp->unsync || sp->role.invalid) - return; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (is_writable_pte(*sptep)) - audit_printk(kvm, "shadow page has writable " - "mappings: gfn %llx role %x\n", - sp->gfn, sp->role.word); - } -} - -static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - check_mappings_rmap(kvm, sp); - audit_write_protection(kvm, sp); -} - -static void audit_all_active_sps(struct kvm *kvm) -{ - walk_all_active_sps(kvm, audit_sp); -} - -static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - audit_sptes_have_rmaps(vcpu, sptep, level); - audit_mappings(vcpu, sptep, level); - audit_spte_after_sync(vcpu, sptep, level); -} - -static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, audit_spte); -} - -static bool mmu_audit; -static DEFINE_STATIC_KEY_FALSE(mmu_audit_key); - -static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!__ratelimit(&ratelimit_state)) - return; - - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); -} - -static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - if (static_branch_unlikely((&mmu_audit_key))) - __kvm_mmu_audit(vcpu, point); -} - -static void mmu_audit_enable(void) -{ - if (mmu_audit) - return; - - static_branch_inc(&mmu_audit_key); - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - static_branch_dec(&mmu_audit_key); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = kstrtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static const struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index de5e8e4e1aa7..12247b96af01 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -416,6 +416,29 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_mmu_split_huge_page, + TP_PROTO(u64 gfn, u64 spte, int level, int errno), + TP_ARGS(gfn, spte, level, errno), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(int, level) + __field(int, errno) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = spte; + __entry->level = level; + __entry->errno = errno; + ), + + TP_printk("gfn %llx spte %llx level %d errno %d", + __entry->gfn, __entry->spte, __entry->level, __entry->errno) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5b5bdac97c7b..aa0e3c246aca 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -904,12 +904,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = FNAME(fetch)(vcpu, fault, &walker); - kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 73cfe62fdad1..4739b53c9734 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -192,6 +192,65 @@ out: return wrprot; } +static u64 make_spte_executable(u64 spte) +{ + bool is_access_track = is_access_track_spte(spte); + + if (is_access_track) + spte = restore_acc_track_spte(spte); + + spte &= ~shadow_nx_mask; + spte |= shadow_x_mask; + + if (is_access_track) + spte = mark_spte_for_access_track(spte); + + return spte; +} + +/* + * Construct an SPTE that maps a sub-page of the given huge page SPTE where + * `index` identifies which sub-page. + * + * This is used during huge page splitting to build the SPTEs that make up the + * new page table. + */ +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +{ + u64 child_spte; + int child_level; + + if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) + return 0; + + if (WARN_ON_ONCE(!is_large_pte(huge_spte))) + return 0; + + child_spte = huge_spte; + child_level = huge_level - 1; + + /* + * The child_spte already has the base address of the huge page being + * split. So we just have to OR in the offset to the page at the next + * lower level for the given index. + */ + child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + + if (child_level == PG_LEVEL_4K) { + child_spte &= ~PT_PAGE_SIZE_MASK; + + /* + * When splitting to a 4K page, mark the page executable as the + * NX hugepage mitigation no longer applies. + */ + if (is_nx_huge_page_enabled()) + child_spte = make_spte_executable(child_spte); + } + + return child_spte; +} + + u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte = SPTE_MMU_PRESENT_MASK; @@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte) if (is_access_track_spte(spte)) return spte; - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + check_spte_writable_invariants(spte); WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), @@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_acc_track_mask = 0; shadow_me_mask = sme_me_mask; - shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; - shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index be6a007a4af3..73f12615416f 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* - * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits - * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs - * that map guest pages in read-only memslots and read-only VMAs. - * - * Invariants: - * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. - * - * - * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU - * allows writes to the guest page mapped by the SPTE. This bit is cleared when - * the guest page mapped by the SPTE contains a page table that is being - * monitored for shadow paging. In this case the SPTE can only be made writable - * by unsyncing the shadow page under the mmu_lock. - * - * Invariants: - * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. - * - If MMU-writable is set, Host-writable must be set. - * - * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared - * to track writes for dirty logging. For such SPTEs, KVM will locklessly set - * PT_WRITABLE_MASK upon the next write from the guest and record the write in - * the dirty log (see fast_page_fault()). + * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given + * SPTE is write-protected. See is_writable_pte() for details. */ /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) +#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care @@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, __is_rsvd_bits_set(rsvd_check, spte, level); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +/* + * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * + * 1. To intercept writes for dirty logging. KVM write-protects huge pages + * so that they can be split be split down into the dirty logging + * granularity (4KiB) whenever the guest writes to them. KVM also + * write-protects 4KiB pages so that writes can be recorded in the dirty log + * (e.g. if not using PML). SPTEs are write-protected for dirty logging + * during the VM-iotcls that enable dirty logging. + * + * 2. To intercept writes to guest page tables that KVM is shadowing. When a + * guest writes to its page table the corresponding shadow page table will + * be marked "unsync". That way KVM knows which shadow page tables need to + * be updated on the next TLB flush, INVLPG, etc. and which do not. + * + * 3. To prevent guest writes to read-only memory, such as for memory in a + * read-only memslot or guest memory backed by a read-only VMA. Writes to + * such pages are disallowed entirely. + * + * To keep track of why a given SPTE is write-protected, KVM uses 2 + * software-only bits in the SPTE: + * + * shadow_mmu_writable_mask, aka MMU-writable - + * Cleared on SPTEs that KVM is currently write-protecting for shadow paging + * purposes (case 2 above). + * + * shadow_host_writable_mask, aka Host-writable - + * Cleared on SPTEs that are not host-writable (case 3 above) + * + * Note, not all possible combinations of PT_WRITABLE_MASK, + * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given + * SPTE can be in only one of the following states, which map to the + * aforementioned 3 cases: + * + * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK + * ------------------------- | ------------------------ | ---------------- + * 1 | 1 | 1 (writable) + * 1 | 1 | 0 (case 1) + * 1 | 0 | 0 (case 2) + * 0 | 0 | 0 (case 3) + * + * The valid combinations of these bits are checked by + * check_spte_writable_invariants() whenever an SPTE is modified. + * + * Clearing the MMU-writable bit is always done under the MMU lock and always + * accompanied by a TLB flush before dropping the lock to avoid corrupting the + * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging + * (which does not clear the MMU-writable bit), does not flush TLBs before + * dropping the lock, as it only needs to synchronize guest writes with the + * dirty bitmap. + * + * So, there is the problem: clearing the MMU-writable bit can encounter a + * write-protected SPTE while CPUs still have writable mappings for that SPTE + * cached in their TLB. To address this, KVM always flushes TLBs when + * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. + * + * The Host-writable bit is not modified on present SPTEs, it is only set or + * cleared when an SPTE is first faulted in from non-present and then remains + * immutable. + */ +static inline bool is_writable_pte(unsigned long pte) { - if (spte & shadow_mmu_writable_mask) { - WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); - return true; - } + return pte & PT_WRITABLE_MASK; +} + +/* Note: spte must be a shadow-present leaf SPTE. */ +static inline void check_spte_writable_invariants(u64 spte) +{ + if (spte & shadow_mmu_writable_mask) + WARN_ONCE(!(spte & shadow_host_writable_mask), + "kvm: MMU-writable SPTE is not Host-writable: %llx", + spte); + else + WARN_ONCE(is_writable_pte(spte), + "kvm: Writable SPTE is not MMU-writable: %llx", spte); +} - WARN_ON_ONCE(spte & PT_WRITABLE_MASK); - return false; +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return spte & shadow_mmu_writable_mask; } static inline u64 get_mmio_spte_generation(u64 spte) @@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); + +/* Restore an acc-track PTE back to a regular PTE */ +static inline u64 restore_acc_track_spte(u64 spte) +{ + u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) + & SHADOW_ACC_TRACK_SAVED_BITS_MASK; + + spte &= ~shadow_acc_track_mask; + spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); + spte |= saved_bits; + + return spte; +} + u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); void kvm_mmu_reset_all_pte_masks(void); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index caa96c270b95..be3f096db2eb 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter) * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { + int root_level = root->role.level; + WARN_ON(root_level < 1); WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; - iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; + iter->as_id = kvm_mmu_page_as_id(root); tdp_iter_restart(iter); } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index e19cabbcb65c..216ebbe76ddd 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -57,17 +57,17 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. */ -#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ - for (tdp_iter_start(&iter, root, root_level, min_level, start); \ +#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start); \ iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, root_level, start, end) \ - for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte(iter, root, start, end) \ + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bc9e3553fba2..8def8f810cb0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -99,15 +99,18 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Finds the next valid root after root (or the first valid root if root - * is NULL), takes a reference on it, and returns that next root. If root - * is not NULL, this thread should have already taken a reference on it, and - * that reference will be dropped. If no valid root is found, this - * function will return NULL. + * Returns the next root after @prev_root (or the first root if @prev_root is + * NULL). A reference to the returned root is acquired, and the reference to + * @prev_root is released (the caller obviously must hold a reference to + * @prev_root if it's non-NULL). + * + * If @only_valid is true, invalid roots are skipped. + * + * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool shared) + bool shared, bool only_valid) { struct kvm_mmu_page *next_root; @@ -121,9 +124,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, typeof(*next_root), link); - while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + while (next_root) { + if ((!only_valid || !next_root->role.invalid) && + kvm_tdp_mmu_get_root(next_root)) + break; + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, &next_root->link, typeof(*next_root), link); + } rcu_read_unlock(); @@ -143,13 +151,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * mode. In the unlikely event that this thread must free a root, the lock * will be temporarily dropped and reacquired in write mode. */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else +#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) + +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false) + #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ lockdep_is_held_type(&kvm->mmu_lock, 0) || \ @@ -157,57 +171,63 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else -static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, - int level) +static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role; + struct kvm_mmu_page *sp; - role = vcpu->arch.mmu->mmu_role.base; - role.level = level; - role.direct = true; - role.has_4_byte_gpte = false; - role.access = ACC_ALL; - role.ad_disabled = !shadow_accessed_mask; + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - return role; + return sp; } -static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) +static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn, + union kvm_mmu_page_role role) { - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp->role.word = page_role_for_level(vcpu, level).word; + sp->role = role; sp->gfn = gfn; sp->tdp_mmu_page = true; trace_kvm_mmu_get_page(sp, true); +} - return sp; +static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, + struct tdp_iter *iter) +{ + struct kvm_mmu_page *parent_sp; + union kvm_mmu_page_role role; + + parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); + + role = parent_sp->role; + role.level--; + + tdp_mmu_init_sp(child_sp, iter->gfn, role); } hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role; + union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - - /* Check for an existing root before allocating a new one. */ + /* + * Check for an existing root before allocating a new one. Note, the + * role check prevents consuming an invalid root. + */ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(kvm, root)) + kvm_tdp_mmu_get_root(root)) goto out; } - root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_sp(root, 0, role); + refcount_set(&root->tdp_mmu_root_count, 1); spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -252,25 +272,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } /** - * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU - * - * @kvm: kvm instance - * @sp: the new page - * @account_nx: This page replaces a NX large page and should be marked for - * eventual reclaim. - */ -static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool account_nx) -{ - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -} - -/** - * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages * * @kvm: kvm instance * @sp: the page to be removed @@ -278,8 +280,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, * the MMU lock and the operation must synchronize with other * threads that might be adding or removing pages. */ -static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared) +static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) { if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -295,7 +297,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, } /** - * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * handle_removed_pt() - handle a page table removed from the TDP structure * * @kvm: kvm instance * @pt: the page removed from the paging structure @@ -311,8 +313,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * this thread will be responsible for ensuring the page is freed. Hence the * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, - bool shared) +static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) { struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; @@ -321,7 +322,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_page(kvm, sp, shared); + tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { u64 *sptep = rcu_dereference(pt) + i; @@ -435,6 +436,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + if (is_leaf) + check_spte_writable_invariants(new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -472,8 +476,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * the paging structure. */ if (was_present && !was_leaf && (pfn_changed || !is_present)) - handle_removed_tdp_mmu_page(kvm, - spte_to_child_pt(old_spte, level), shared); + handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -492,16 +495,25 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * + * If setting the SPTE fails because it has changed, iter->old_spte will be + * refreshed to the current value of the spte. + * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @new_spte: The value the SPTE should be set to - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. + * Return: + * * 0 - If the SPTE was set. + * * -EBUSY - If the SPTE cannot be set. In this case this function will have + * no side-effects other than setting iter->old_spte to the last + * known value of the spte. */ -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { + u64 *sptep = rcu_dereference(iter->sptep); + u64 old_spte; + WARN_ON_ONCE(iter->yielded); lockdep_assert_held_read(&kvm->mmu_lock); @@ -511,34 +523,45 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, * may modify it. */ if (is_removed_spte(iter->old_spte)) - return false; + return -EBUSY; /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, - new_spte) != iter->old_spte) - return false; + old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); + if (old_spte != iter->old_spte) { + /* + * The page table entry was modified by a different logical + * CPU. Refresh iter->old_spte with the current value so the + * caller operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic(). + */ + iter->old_spte = old_spte; + return -EBUSY; + } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); - return true; + return 0; } -static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { + int ret; + /* * Freeze the SPTE by setting it to a special, * non-present value. This will stop other threads from * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) - return false; + ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + if (ret) + return ret; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, KVM_PAGES_PER_HPAGE(iter->level)); @@ -553,7 +576,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, */ WRITE_ONCE(*rcu_dereference(iter->sptep), 0); - return true; + return 0; } @@ -624,7 +647,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + for_each_tdp_pte(_iter, _root, _start, _end) #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -634,8 +657,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ - _mmu->shadow_root_level, _start, _end) + for_each_tdp_pte(_iter, to_shadow_page(_mmu->root_hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control @@ -724,8 +746,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { @@ -750,12 +771,7 @@ retry: if (!shared) { tdp_mmu_set_spte(kvm, &iter, 0); flush = true; - } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) { goto retry; } } @@ -913,7 +929,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -947,6 +963,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, } /* + * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the + * provided page table. + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @sp: The new TDP page table to install. + * @account_nx: True if this page table is being installed to split a + * non-executable huge page. + * @shared: This operation is running under the MMU lock in read mode. + * + * Returns: 0 if the new page table was installed. Non-0 if the page table + * could not be installed (e.g. the atomic compare-exchange failed). + */ +static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool account_nx, + bool shared) +{ + u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask); + int ret = 0; + + if (shared) { + ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); + if (ret) + return ret; + } else { + tdp_mmu_set_spte(kvm, iter, spte); + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + + return 0; +} + +/* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ @@ -955,8 +1009,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; - u64 *child_pt; - u64 new_spte; int ret; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -979,7 +1031,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) break; /* @@ -991,6 +1043,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (!is_shadow_present_pte(iter.old_spte)) { + bool account_nx = fault->huge_page_disallowed && + fault->req_level >= iter.level; + /* * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table @@ -999,19 +1054,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); - child_pt = sp->spt; - - new_spte = make_nonleaf_spte(child_pt, - !shadow_accessed_mask); + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, - fault->huge_page_disallowed && - fault->req_level >= iter.level); - - trace_kvm_mmu_get_page(sp, true); - } else { + if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { tdp_mmu_free_sp(sp); break; } @@ -1032,13 +1078,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - struct kvm_mmu_page *root; - - for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) - flush = zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); - - return flush; + return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start, + range->end, range->may_block, flush); } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, @@ -1180,8 +1221,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1193,14 +1233,9 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1221,13 +1256,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); return spte_set; } +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +{ + struct kvm_mmu_page *sp; + + gfp |= __GFP_ZERO; + + sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + if (!sp) + return NULL; + + sp->spt = (void *)__get_free_page(gfp); + if (!sp->spt) { + kmem_cache_free(mmu_page_header_cache, sp); + return NULL; + } + + return sp; +} + +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, + struct tdp_iter *iter, + bool shared) +{ + struct kvm_mmu_page *sp; + + /* + * Since we are allocating while under the MMU lock we have to be + * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct + * reclaim and to avoid making any filesystem callbacks (which can end + * up invoking KVM MMU notifiers, resulting in a deadlock). + * + * If this allocation fails we drop the lock and retry with reclaim + * allowed. + */ + sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); + if (sp) + return sp; + + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + iter->yielded = true; + sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + + rcu_read_lock(); + + return sp; +} + +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared) +{ + const u64 huge_spte = iter->old_spte; + const int level = iter->level; + int ret, i; + + tdp_mmu_init_child_sp(sp, iter); + + /* + * No need for atomics when writing to sp->spt since the page table has + * not been linked in yet and thus is not reachable from any other CPU. + */ + for (i = 0; i < PT64_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + + /* + * Replace the huge spte with a pointer to the populated lower level + * page table. Since we are making this change without a TLB flush vCPUs + * will see a mix of the split mappings and the original huge mapping, + * depending on what's currently in their TLB. This is fine from a + * correctness standpoint since the translation will be the same either + * way. + */ + ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + if (ret) + goto out; + + /* + * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we + * are overwriting from the page stats. But we have to manually update + * the page stats with the new present child pages. + */ + kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + +out: + trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); + return ret; +} + +static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *sp = NULL; + struct tdp_iter iter; + int ret = 0; + + rcu_read_lock(); + + /* + * Traverse the page table splitting all huge pages above the target + * level into one lower level. For example, if we encounter a 1GB page + * we split it into 512 2MB pages. + * + * Since the TDP iterator uses a pre-order traversal, we are guaranteed + * to visit an SPTE before ever visiting its children, which means we + * will correctly recursively split huge pages that are more than one + * level above the target level (e.g. splitting a 1GB to 512 2MB pages, + * and then splitting each of those to 512 4KB pages). + */ + for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) + continue; + + if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) + continue; + + if (!sp) { + sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + if (!sp) { + ret = -ENOMEM; + trace_kvm_mmu_split_huge_page(iter.gfn, + iter.old_spte, + iter.level, ret); + break; + } + + if (iter.yielded) + continue; + } + + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) + goto retry; + + sp = NULL; + } + + rcu_read_unlock(); + + /* + * It's possible to exit the loop having never used the last sp if, for + * example, a vCPU doing HugePage NX splitting wins the race and + * installs its own sp in place of the last sp we tried to split. + */ + if (sp) + tdp_mmu_free_sp(sp); + + return ret; +} + + +/* + * Try to split all huge pages mapped by the TDP MMU down to the target level. + */ +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *root; + int r = 0; + + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { + r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); + if (r) { + kvm_tdp_mmu_put_root(kvm, root, shared); + break; + } + } +} + /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. @@ -1261,14 +1480,9 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1291,7 +1505,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1392,14 +1606,8 @@ retry: continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) goto retry; - } } rcu_read_unlock(); @@ -1416,7 +1624,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) zap_collapsible_spte_range(kvm, root, slot); } @@ -1436,8 +1644,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3899004a5d91..3f987785702a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,12 +7,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, - struct kvm_mmu_page *root) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { - if (root->role.invalid) - return false; - return refcount_inc_not_zero(&root->tdp_mmu_root_count); } @@ -71,6 +67,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared); + static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock(); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index fb3e20791338..d4fa8c4f3a9a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -578,7 +578,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -void avic_post_state_restore(struct kvm_vcpu *vcpu) +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; @@ -586,20 +586,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - return; -} - -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ -} - -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ -} - -static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; @@ -631,7 +618,7 @@ out: return ret; } -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -648,7 +635,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * we need to check and update the AVIC logical APIC ID table * accordingly before re-activating. */ - avic_post_state_restore(vcpu); + avic_apicv_post_state_restore(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; @@ -660,17 +647,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) else avic_vcpu_put(vcpu); - svm_set_pi_irte_mode(vcpu, activated); -} - -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - return; -} - -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; + avic_set_pi_irte_mode(vcpu, activated); } static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) @@ -770,7 +747,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, } /* - * svm_update_pi_irte - set IRTE for Posted-Interrupts + * avic_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -778,8 +755,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; @@ -879,7 +856,7 @@ out: return ret; } -bool svm_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h new file mode 100644 index 000000000000..7d6d97968fb9 --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM). + */ + +#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__ +#define __ARCH_X86_KVM_SVM_HYPERV_H__ + +#include <asm/mshyperv.h> + +#include "../hyperv.h" + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + +#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 39d280e7e80e..f284e61451c8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -28,6 +28,7 @@ #include "cpuid.h" #include "lapic.h" #include "svm.h" +#include "hyperv.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } +/* + * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function + * is optimized in that it only merges the parts where KVM MSR permission bitmap + * may contain zero bits. + */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + struct hv_enlightenments *hve = + (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + int i; + /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) is using Hyper-V emulation interface and + * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - int i; + if (!svm->nested.force_msr_bitmap_recalc && + kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; @@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } + svm->nested.force_msr_bitmap_recalc = false; + +set_msrpm_base_pa: svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) } static -void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, +void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *to, struct vmcb_control_area *from) { unsigned int i; @@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, to->asid = from->asid; to->msrpm_base_pa &= ~0x0fffULL; to->iopm_base_pa &= ~0x0fffULL; + + /* Hyper-V extensions (Enlightened VMCB) */ + if (kvm_hv_hypercall_enabled(vcpu)) { + to->clean = from->clean; + memcpy(to->reserved_sw, from->reserved_sw, + sizeof(struct hv_enlightenments)); + } } void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control) { - __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); + __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control); } static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, @@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + svm->nested.force_msr_bitmap_recalc = true; } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { @@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; + /* 'clean' and 'reserved_sw' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; @@ -1495,6 +1525,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(ret)) goto out_free; + svm->nested.force_msr_bitmap_recalc = true; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 17b53457d866..789b69294d28 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free; INIT_LIST_HEAD(&sev->regions_list); + INIT_LIST_HEAD(&sev->mirror_vms); return 0; @@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) } } -static void sev_migrate_from(struct kvm_sev_info *dst, - struct kvm_sev_info *src) +static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { + struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *mirror; + dst->active = true; dst->asid = src->asid; dst->handle = src->handle; @@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst, src->enc_context_owner = NULL; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); + + /* + * If this VM has mirrors, "transfer" each mirror's refcount of the + * source to the destination (this KVM). The caller holds a reference + * to the source, so there's no danger of use-after-free. + */ + list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); + list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { + kvm_get_kvm(dst_kvm); + kvm_put_kvm(src_kvm); + mirror->enc_context_owner = dst_kvm; + } + + /* + * If this VM is a mirror, remove the old mirror from the owners list + * and add the new mirror to the list. + */ + if (is_mirroring_enc_context(dst_kvm)) { + struct kvm_sev_info *owner_sev_info = + &to_kvm_svm(dst->enc_context_owner)->sev_info; + + list_del(&src->mirror_entry); + list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); + } } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) @@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) return 0; } -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; @@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) src_sev = &to_kvm_svm(source_kvm)->sev_info; - /* - * VMs mirroring src's encryption context rely on it to keep the - * ASID allocated, but below we are clearing src_sev->asid. - */ - if (src_sev->num_mirrored_vms) { - ret = -EBUSY; - goto out_unlock; - } - dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_source_vcpu; } - sev_migrate_from(dst_sev, src_sev); + + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; @@ -1761,7 +1781,7 @@ out_fput: return ret; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; @@ -1858,8 +1878,8 @@ out: return r; } -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct enc_region *region; @@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct enc_region *region; int ret; @@ -1972,7 +1992,7 @@ failed: return ret; } -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; @@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - source_sev->num_mirrored_vms++; + mirror_sev = &to_kvm_svm(kvm)->sev_info; + list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ - mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; mirror_sev->asid = source_sev->asid; @@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->es_active = source_sev->es_active; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + INIT_LIST_HEAD(&mirror_sev->mirror_vms); ret = 0; /* @@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; - WARN_ON(sev->num_mirrored_vms); - if (!sev_guest(kvm)) return; + WARN_ON(!list_empty(&sev->mirror_vms)); + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner; - struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; mutex_lock(&owner_kvm->lock); - if (!WARN_ON(!owner_sev->num_mirrored_vms)) - owner_sev->num_mirrored_vms--; + list_del(&sev->mirror_entry); mutex_unlock(&owner_kvm->lock); kvm_put_kvm(owner_kvm); return; @@ -2173,7 +2192,7 @@ out: #endif } -void sev_hardware_teardown(void) +void sev_hardware_unsetup(void) { if (!sev_enabled) return; @@ -2907,20 +2926,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) +void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - struct vmcb_save_area *hostsa; - /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. Since hardware does not - * perform a VMSAVE on VMRUN, the host savearea must be updated. + * of which one step is to perform a VMLOAD. KVM performs the + * corresponding VMSAVE in svm_prepare_guest_switch for both + * traditional and SEV-ES guests. */ - vmsave(__sme_page_pa(sd->save_area)); /* XCR0 is restored on VMEXIT, save the current host value */ - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); /* PKRU is restored on VMEXIT, save the current host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 821edf664e7a..7038c76fa841 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -263,7 +263,7 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -#define MAX_INST_SIZE 15 +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); static int get_npt_level(void) { @@ -353,7 +353,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -401,7 +401,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - (void)skip_emulated_instruction(vcpu); + (void)svm_skip_emulated_instruction(vcpu); rip = kvm_rip_read(vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; @@ -668,6 +668,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { + struct vcpu_svm *svm = to_svm(vcpu); u8 bit_read, bit_write; unsigned long tmp; u32 offset; @@ -698,7 +699,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, msrpm[offset] = tmp; svm_hv_vmcb_dirty_nested_enlightenments(vcpu); - + svm->nested.force_msr_bitmap_recalc = true; } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, @@ -873,11 +874,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void svm_hardware_teardown(void) +static void svm_hardware_unsetup(void) { int cpu; - sev_hardware_teardown(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1175,7 +1176,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } -static int svm_create_vcpu(struct kvm_vcpu *vcpu) +static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; @@ -1246,7 +1247,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); } -static void svm_free_vcpu(struct kvm_vcpu *vcpu) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1265,7 +1266,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); @@ -1280,10 +1281,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ + vmsave(__sme_page_pa(sd->save_area)); if (sev_es_guest(vcpu->kvm)) { - sev_es_prepare_guest_switch(svm, vcpu->cpu); - } else { - vmsave(__sme_page_pa(sd->save_area)); + struct vmcb_save_area *hostsa; + hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + + sev_es_prepare_switch_to_guest(hostsa); } if (tsc_scaling) { @@ -1529,6 +1532,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } +static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + svm_get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} + static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1563,7 +1575,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } -static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1647,7 +1659,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = vcpu->arch.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb_current(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) { @@ -2269,7 +2281,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (!skip_emulated_instruction(vcpu)) + if (!svm_skip_emulated_instruction(vcpu)) return 0; } @@ -3134,7 +3146,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(u64 exit_code) { return (exit_code < ARRAY_SIZE(svm_exit_handlers) && svm_exit_handlers[exit_code]); @@ -3154,7 +3166,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (!svm_check_exit_valid(vcpu, exit_code)) + if (!svm_check_exit_valid(exit_code)) return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE @@ -3189,7 +3201,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } -static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -3286,7 +3298,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; } -static void svm_set_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3516,17 +3528,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - -void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3900,11 +3902,6 @@ static int __init svm_check_processor_compat(void) return 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -3927,11 +3924,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4239,7 +4231,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * by 0x400 (matches the offset of 'struct vmcb_save_area' * within 'struct vmcb'). Note: HSAVE area may also be used by * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be + * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), @@ -4514,21 +4506,20 @@ static int svm_vm_init(struct kvm *kvm) static struct kvm_x86_ops svm_x86_ops __initdata = { .name = "kvm_amd", - .hardware_unsetup = svm_hardware_teardown, + .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .has_emulated_msr = svm_has_emulated_msr, - .vcpu_create = svm_create_vcpu, - .vcpu_free = svm_free_vcpu, + .vcpu_create = svm_vcpu_create, + .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, .vm_size = sizeof(struct kvm_svm), .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .prepare_switch_to_guest = svm_prepare_switch_to_guest, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = avic_vcpu_blocking, @@ -4542,9 +4533,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, - .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .get_cs_db_l_bits = svm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, - .post_set_cr3 = svm_post_set_cr3, + .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4559,21 +4550,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .tlb_flush_all = svm_flush_tlb, - .tlb_flush_current = svm_flush_tlb, - .tlb_flush_gva = svm_flush_tlb_gva, - .tlb_flush_guest = svm_flush_tlb, + .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_current = svm_flush_tlb_current, + .flush_tlb_gva = svm_flush_tlb_gva, + .flush_tlb_guest = svm_flush_tlb_current, .vcpu_pre_run = svm_vcpu_pre_run, - .run = svm_vcpu_run, - .handle_exit = handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_run = svm_vcpu_run, + .handle_exit = svm_handle_exit, + .skip_emulated_instruction = svm_skip_emulated_instruction, .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, + .inject_irq = svm_inject_irq, + .inject_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4583,17 +4574,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = svm_set_virtual_apic_mode, - .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .load_eoi_exitmap = svm_load_eoi_exitmap, - .hwapic_irr_update = svm_hwapic_irr_update, - .hwapic_isr_update = svm_hwapic_isr_update, - .apicv_post_state_restore = avic_post_state_restore, - - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, + .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, + .apicv_post_state_restore = avic_apicv_post_state_restore, .get_exit_info = svm_get_exit_info, @@ -4619,8 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, - .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, - .update_pi_irte = svm_update_pi_irte, + .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, @@ -4628,12 +4610,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, - .mem_enc_op = svm_mem_enc_op, - .mem_enc_reg_region = svm_register_enc_region, - .mem_enc_unreg_region = svm_unregister_enc_region, + .mem_enc_ioctl = sev_mem_enc_ioctl, + .mem_enc_register_region = sev_mem_enc_register_region, + .mem_enc_unregister_region = sev_mem_enc_unregister_region, - .vm_copy_enc_context_from = svm_vm_copy_asid_from, - .vm_move_enc_context_from = svm_vm_migrate_from, + .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, + .vm_move_enc_context_from = sev_vm_move_enc_context_from, .can_emulate_instruction = svm_can_emulate_instruction, @@ -4878,7 +4860,7 @@ static __init int svm_hardware_setup(void) return 0; err: - svm_hardware_teardown(); + svm_hardware_unsetup(); return r; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fa98d6844728..70850cbe5bcb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,7 +79,8 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ - unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ + struct list_head mirror_vms; /* List of VMs mirroring */ + struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; @@ -137,6 +138,8 @@ struct vmcb_ctrl_area_cached { u32 event_inj_err; u64 nested_cr3; u64 virt_ext; + u32 clean; + u8 reserved_sw[32]; }; struct svm_nested_state { @@ -163,6 +166,15 @@ struct svm_nested_state { struct vmcb_save_area_cached save; bool initialized; + + /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; }; struct vcpu_sev_es_state { @@ -321,7 +333,7 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) /* * Only the PDPTRs are loaded on demand into the shadow MMU. All other - * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. * * CR3 might be out of date in the VMCB but it is not marked dirty; instead, * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 @@ -480,7 +492,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); @@ -567,16 +578,15 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); -void avic_post_state_restore(struct kvm_vcpu *vcpu); -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +bool avic_check_apicv_inhibit_reasons(ulong bit); +void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); +void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); @@ -590,17 +600,17 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); -void sev_hardware_teardown(void); +void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); @@ -608,7 +618,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); +void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 489ca56212c6..e2fc59380465 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -7,35 +7,12 @@ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ #if IS_ENABLED(CONFIG_HYPERV) -#include <asm/mshyperv.h> -#include "hyperv.h" #include "kvm_onhyperv.h" +#include "svm/hyperv.h" static struct kvm_x86_ops svm_x86_ops; -/* - * Hyper-V uses the software reserved 32 bytes in VMCB - * control area to expose SVM enlightenments to guests. - */ -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW - int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 92e6f6702f00..e5a8c271b42d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall, * Tracepoint for hypercall. */ TRACE_EVENT(kvm_hv_hypercall, - TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, - __u64 ingpa, __u64 outgpa), - TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt, + __u16 rep_idx, __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa), TP_STRUCT__entry( __field( __u16, rep_cnt ) @@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall, __field( __u64, ingpa ) __field( __u64, outgpa ) __field( __u16, code ) + __field( __u16, var_cnt ) __field( bool, fast ) ), @@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall, __entry->ingpa = ingpa; __entry->outgpa = outgpa; __entry->code = code; + __entry->var_cnt = var_cnt; __entry->fast = fast; ), - TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", __entry->code, __entry->fast ? "fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) + __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx, + __entry->ingpa, __entry->outgpa) ); TRACE_EVENT(kvm_hv_hypercall_done, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba34e94049c7..c73e4d938ddc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4797,7 +4797,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl) { struct vcpu_vmx *vmx; @@ -4805,7 +4806,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (vcpu_has_perf_global_ctrl) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b69a80f43b37..c92cea0b8ccc 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 466d18fc0c5d..03fab48b149c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -541,7 +541,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_entry_exit_ctls_update(vcpu); + nested_vmx_pmu_refresh(vcpu, + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); if (intel_pmu_lbr_is_compatible(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index aa1fe9085d77..3834bb30ce54 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm) } /* - * pi_update_irte - set IRTE for Posted-Interrupts + * vmx_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm) * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - bool set) +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index eb14e76b84ef..9a45d5c9f116 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); -int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - bool set); +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index efda5e4d6247..d8547144d3b7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - static int possible_passthrough_msr_slot(u32 msr) { u32 i; @@ -2341,7 +2336,7 @@ fault: return -EFAULT; } -static int hardware_enable(void) +static int vmx_hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2382,7 +2377,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void hardware_disable(void) +static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); @@ -3937,31 +3932,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { /* - * The vector of interrupt to be delivered to vcpu had - * been set in PIR before this function. + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. * - * Following cases will be reached in this block, and - * we always send a notification event in all cases as - * explained below. + * When the target is not the running vCPU, the following + * possibilities emerge: * - * Case 1: vcpu keeps in non-root mode. Sending a - * notification event posts the interrupt to vcpu. + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. * - * Case 2: vcpu exits to root mode and is still - * runnable. PIR will be synced to vIRR before the - * next vcpu entry. Sending a notification event in - * this case has no effect, as vcpu is not in root - * mode. + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. * - * Case 3: vcpu exits to root mode and is blocked. - * vcpu_block() has already synced PIR to vIRR and - * never blocks vcpu if vIRR is not cleared. Therefore, - * a blocked vcpu here does not wait for any requested - * interrupts in PIR, and sending a notification event - * which has no effect is safe here. + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. */ - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + if (vcpu != kvm_get_running_vcpu()) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return; } #endif @@ -5184,7 +5181,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!kvm_require_dr(vcpu, dr)) return 1; - if (kvm_x86_ops.get_cpl(vcpu) > 0) + if (vmx_get_cpl(vcpu) > 0) goto out; dr7 = vmcs_readl(GUEST_DR7); @@ -6969,7 +6966,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return vmx_exit_handlers_fastpath(vcpu); } -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +static void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6980,7 +6977,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); } -static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +static int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -7342,11 +7339,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + vmx->msr_ia32_feature_control_valid_bits &= ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); @@ -7685,7 +7682,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void hardware_unsetup(void) +static void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -7708,21 +7705,20 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", - .hardware_unsetup = hardware_unsetup, + .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, + .hardware_enable = vmx_hardware_enable, + .hardware_disable = vmx_hardware_disable, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_prepare_switch_to_guest, + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -7750,21 +7746,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .tlb_flush_all = vmx_flush_tlb_all, - .tlb_flush_current = vmx_flush_tlb_current, - .tlb_flush_gva = vmx_flush_tlb_gva, - .tlb_flush_guest = vmx_flush_tlb_guest, + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, .vcpu_pre_run = vmx_vcpu_pre_run, - .run = vmx_vcpu_run, + .vcpu_run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, @@ -7817,8 +7813,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = pi_update_irte, - .start_assignment = vmx_pi_start_assignment, + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 641044db415d..16d29d41908f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -126,16 +126,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); struct kvm_x86_ops kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); -EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); @@ -194,6 +193,9 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_GPL(enable_pmu); module_param(enable_pmu, bool, 0444); +bool __read_mostly eager_page_split = true; +module_param(eager_page_split, bool, 0644); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -2399,7 +2401,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) +u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; @@ -2414,7 +2416,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } @@ -2422,7 +2424,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { return vcpu->arch.l1_tsc_offset + - kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2625,7 +2627,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -3045,7 +3047,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { @@ -3268,7 +3270,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_all)(vcpu); + static_call(kvm_x86_flush_tlb_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3286,14 +3288,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_prev_roots(vcpu); } - static_call(kvm_x86_tlb_flush_guest)(vcpu); + static_call(kvm_x86_flush_tlb_guest)(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); } /* @@ -3857,7 +3859,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ratio = vcpu->arch.tsc_scaling_ratio; } - msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; + msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: @@ -4233,6 +4235,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4273,9 +4276,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; - case KVM_CAP_VAPIC: - r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); - break; case KVM_CAP_NR_VCPUS: r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; @@ -5132,7 +5132,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && kvm->arch.last_tsc_offset == offset); - tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); @@ -5977,15 +5977,18 @@ split_irqchip_unlock: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: r = -EINVAL; - if (kvm_x86_ops.vm_copy_enc_context_from) - r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); - return r; + if (!kvm_x86_ops.vm_copy_enc_context_from) + break; + + r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); + break; case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: r = -EINVAL; - if (kvm_x86_ops.vm_move_enc_context_from) - r = kvm_x86_ops.vm_move_enc_context_from( - kvm, cap->args[0]); - return r; + if (!kvm_x86_ops.vm_move_enc_context_from) + break; + + r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); + break; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -6460,8 +6463,10 @@ set_pit2_out: break; case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops.mem_enc_op) - r = static_call(kvm_x86_mem_enc_op)(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -6472,8 +6477,10 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_reg_region) - r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -6484,8 +6491,10 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_unreg_region) - r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -8413,8 +8422,7 @@ writeback: kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops.update_emulated_instruction) - static_call(kvm_x86_update_emulated_instruction)(vcpu); + static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -8965,7 +8973,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, * * @apicid - apicid of vcpu to be kicked. */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { struct kvm_lapic_irq lapic_irq; @@ -9084,7 +9092,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) break; - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_pv_kick_cpu_op(vcpu->kvm, a1); kvm_sched_yield(vcpu, a1); ret = 0; break; @@ -9254,10 +9262,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); can_inject = false; } } @@ -9337,7 +9345,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } @@ -9351,7 +9359,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9679,8 +9687,7 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!kvm_x86_ops.check_apicv_inhibit_reasons || - !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) + if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; old = new = kvm->arch.apicv_inhibit_reasons; @@ -9713,10 +9720,12 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) } else kvm->arch.apicv_inhibit_reasons = new; } -EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + if (!enable_apicv) + return; + down_write(&kvm->arch.apicv_update_lock); __kvm_request_apicv_update(kvm, activate, bit); up_write(&kvm->arch.apicv_update_lock); @@ -9755,11 +9764,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } - static_call(kvm_x86_load_eoi_exitmap)( + static_call_cond(kvm_x86_load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } @@ -9782,10 +9791,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops.set_apic_access_page_addr) - return; - - static_call(kvm_x86_set_apic_access_page_addr)(vcpu); + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -9975,7 +9981,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_guest_switch)(vcpu); + static_call(kvm_x86_prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -10056,7 +10062,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); - exit_fastpath = static_call(kvm_x86_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10359,10 +10365,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Exclude PKRU from restore as restored separately in - * kvm_x86_ops.run(). - */ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } @@ -10545,16 +10548,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; @@ -11977,6 +11970,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (kvm_dirty_log_manual_protect_and_init_set(kvm)) return; + if (READ_ONCE(eager_page_split)) + kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); + if (kvm_x86_ops.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); @@ -12021,8 +12017,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -12377,7 +12372,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_start_assignment)(kvm); + static_call_cond(kvm_x86_pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -12425,7 +12420,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -12450,7 +12445,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -12461,7 +12456,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 767ec7f99516..aa86abad914d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -307,6 +307,8 @@ extern int pi_inject_timer; extern bool report_ignored_msrs; +extern bool eager_page_split; + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 74be1fda58e3..4aa0f2b31665 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -732,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) instructions[0] = 0xb8; /* vmcall / vmmcall */ - kvm_x86_ops.patch_hypercall(vcpu, instructions + 5); + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; @@ -867,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; - vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu); + vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); vcpu->run->xen.u.hcall.input = input; vcpu->run->xen.u.hcall.params[0] = params[0]; vcpu->run->xen.u.hcall.params[1] = params[1]; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e1b1dd215267..ba9273f80069 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2448,23 +2448,21 @@ static void ata_dev_config_cpr(struct ata_device *dev) struct ata_cpr_log *cpr_log = NULL; u8 *desc, *buf = NULL; - if (!ata_identify_page_supported(dev, - ATA_LOG_CONCURRENT_POSITIONING_RANGES)) + if (ata_id_major_version(dev->id) < 11 || + !ata_log_supported(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES)) goto out; /* - * Read IDENTIFY DEVICE data log, page 0x47 - * (concurrent positioning ranges). We can have at most 255 32B range - * descriptors plus a 64B header. + * Read the concurrent positioning ranges log (0x47). We can have at + * most 255 32B range descriptors plus a 64B header. */ buf_len = (64 + 255 * 32 + 511) & ~511; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) goto out; - err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, - ATA_LOG_CONCURRENT_POSITIONING_RANGES, - buf, buf_len >> 9); + err_mask = ata_read_log_page(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES, + 0, buf, buf_len >> 9); if (err_mask) goto out; diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 45f578793980..bd87012c220c 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -67,7 +67,7 @@ static const unsigned int sd_au_size[] = { __res & __mask; \ }) -#define SD_POWEROFF_NOTIFY_TIMEOUT_MS 2000 +#define SD_POWEROFF_NOTIFY_TIMEOUT_MS 1000 #define SD_WRITE_EXTR_SINGLE_TIMEOUT_MS 1000 struct sd_busy_data { @@ -1664,6 +1664,12 @@ static int sd_poweroff_notify(struct mmc_card *card) goto out; } + /* Find out when the command is completed. */ + err = mmc_poll_for_busy(card, SD_WRITE_EXTR_SINGLE_TIMEOUT_MS, false, + MMC_BUSY_EXTR_SINGLE); + if (err) + goto out; + cb_data.card = card; cb_data.reg_buf = reg_buf; err = __mmc_poll_for_busy(card->host, SD_POWEROFF_NOTIFY_TIMEOUT_MS, diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 16d1c7a43d33..b6eb75f4bbfc 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -705,12 +705,12 @@ static int moxart_remove(struct platform_device *pdev) if (!IS_ERR_OR_NULL(host->dma_chan_rx)) dma_release_channel(host->dma_chan_rx); mmc_remove_host(mmc); - mmc_free_host(mmc); writel(0, host->base + REG_INTERRUPT_MASK); writel(0, host->base + REG_POWER_CONTROL); writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, host->base + REG_CLOCK_CONTROL); + mmc_free_host(mmc); return 0; } diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index a593b1fbd69e..0f3658b36513 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -524,12 +524,16 @@ static void esdhc_of_adma_workaround(struct sdhci_host *host, u32 intmask) static int esdhc_of_enable_dma(struct sdhci_host *host) { + int ret; u32 value; struct device *dev = mmc_dev(host->mmc); if (of_device_is_compatible(dev->of_node, "fsl,ls1043a-esdhc") || - of_device_is_compatible(dev->of_node, "fsl,ls1046a-esdhc")) - dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); + of_device_is_compatible(dev->of_node, "fsl,ls1046a-esdhc")) { + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); + if (ret) + return ret; + } value = sdhci_readl(host, ESDHC_DMA_SYSCTL); diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index bcc595c70a9f..104dcd702870 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -405,6 +405,9 @@ static int sh_mmcif_dma_slave_config(struct sh_mmcif_host *host, struct dma_slave_config cfg = { 0, }; res = platform_get_resource(host->pd, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; + cfg.direction = direction; if (direction == DMA_DEV_TO_MEM) { diff --git a/fs/Kconfig b/fs/Kconfig index 7a2b11c0b803..6c7dc1387beb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -369,8 +369,8 @@ source "fs/ksmbd/Kconfig" config SMBFS_COMMON tristate - default y if CIFS=y - default m if CIFS=m + default y if CIFS=y || SMB_SERVER=y + default m if CIFS=m || SMB_SERVER=m source "fs/coda/Kconfig" source "fs/afs/Kconfig" diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index dc3d061edda9..911444d21267 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -29,6 +29,7 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" +#include "../smbfs_common/arc4.h" /* * Fixed format data defining GSS header and fixed string @@ -336,6 +337,29 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, nt_len - CIFS_ENCPWD_SIZE, domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); + + /* The recovered secondary session key */ + if (conn->ntlmssp.client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) { + struct arc4_ctx *ctx_arc4; + unsigned int sess_key_off, sess_key_len; + + sess_key_off = le32_to_cpu(authblob->SessionKey.BufferOffset); + sess_key_len = le16_to_cpu(authblob->SessionKey.Length); + + if (blob_len < (u64)sess_key_off + sess_key_len) + return -EINVAL; + + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); + if (!ctx_arc4) + return -ENOMEM; + + cifs_arc4_setkey(ctx_arc4, sess->sess_key, + SMB2_NTLMV2_SESSKEY_SIZE); + cifs_arc4_crypt(ctx_arc4, sess->sess_key, + (char *)authblob + sess_key_off, sess_key_len); + kfree_sensitive(ctx_arc4); + } + return ret; } @@ -408,6 +432,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (cflags & NTLMSSP_NEGOTIATE_KEY_XCH) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + chgblob->NegotiateFlags = cpu_to_le32(flags); len = strlen(ksmbd_netbios_name()); name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 1866c81c5c99..67e8e28e3fc3 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2688,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work) (struct create_posix *)context; if (le16_to_cpu(context->DataOffset) + le32_to_cpu(context->DataLength) < - sizeof(struct create_posix)) { + sizeof(struct create_posix) - 4) { rc = -EINVAL; goto err_out1; } @@ -3422,9 +3422,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level); - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, - KSMBD_DIR_INFO_ALIGNMENT); + struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); + d_info->last_entry_off_align = next_entry_offset - struct_sz; if (next_entry_offset > d_info->out_buf_len) { d_info->out_buf_len = 0; @@ -3976,6 +3976,7 @@ int smb2_query_dir(struct ksmbd_work *work) ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); @@ -6126,13 +6127,26 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { + unsigned int i, ch_count; + if (work->conn->dialect == SMB30_PROT_ID && Channel != SMB2_CHANNEL_RDMA_V1) return -EINVAL; - if (ChannelInfoOffset == 0 || - le16_to_cpu(ChannelInfoLength) < sizeof(*desc)) + ch_count = le16_to_cpu(ChannelInfoLength) / sizeof(*desc); + if (ksmbd_debug_types & KSMBD_DEBUG_RDMA) { + for (i = 0; i < ch_count; i++) { + pr_info("RDMA r/w request %#x: token %#x, length %#x\n", + i, + le32_to_cpu(desc[i].token), + le32_to_cpu(desc[i].length)); + } + } + if (ch_count != 1) { + ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", + ch_count); return -EINVAL; + } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); @@ -6185,9 +6199,15 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); + + if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { + err = -EINVAL; + goto out; + } err = smb2_set_remote_key_for_rdma(work, (struct smb2_buffer_desc_v1 *) - &req->Buffer[0], + ((char *)req + ch_offset), req->Channel, req->ReadChannelInfoOffset, req->ReadChannelInfoLength); @@ -6428,11 +6448,16 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; + unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); + + if (req->Length != 0 || req->DataOffset != 0 || + ch_offset < offsetof(struct smb2_write_req, Buffer)) { + err = -EINVAL; + goto out; + } err = smb2_set_remote_key_for_rdma(work, (struct smb2_buffer_desc_v1 *) - &req->Buffer[0], + ((char *)req + ch_offset), req->Channel, req->WriteChannelInfoOffset, req->WriteChannelInfoLength); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index ef7f42b0290a..9a7e211dbf4f 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, for (i = 0; i < 2; i++) { struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; + struct dentry *dentry; if (!dir->dot_dotdot[i]) { /* fill dot entry info */ if (i == 0) { d_info->name = "."; d_info->name_len = 1; + dentry = dir->filp->f_path.dentry; } else { d_info->name = ".."; d_info->name_len = 2; + dentry = dir->filp->f_path.dentry->d_parent; } if (!match_pattern(d_info->name, d_info->name_len, @@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, user_ns, - dir->filp->f_path.dentry->d_parent, + dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); if (rc) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 3c1ec1ac0b27..ba5a22bc2e6d 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1048512; +static int smb_direct_max_read_write_size = 524224; static int smb_direct_max_outstanding_rw_ops = 8; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index adf94a4f22fa..8c37aaf936ab 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -47,6 +47,7 @@ struct ksmbd_dir_info { int last_entry_offset; bool hide_dot_file; int flags; + int last_entry_off_align; }; struct ksmbd_readdir_data { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f18e80fda9bf..d1f34229e11a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -177,6 +177,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; @@ -423,7 +424,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 848f3b8fb821..7bc7cf6b26f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -80,6 +80,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->page_index = 0; + ctx->eof = false; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -168,6 +169,7 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; signed char duped; bool plus; + bool eob; bool eof; }; @@ -867,7 +869,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); - } while (!status && nfs_readdir_page_needs_filling(page)); + } while (!status && nfs_readdir_page_needs_filling(page) && + page_mapping(page)); nfs_readdir_free_pages(pages, array_size); out: @@ -988,7 +991,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = true; + desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); @@ -1004,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->duped = 1; } if (array->page_is_eof) - desc->eof = true; + desc->eof = !desc->eob; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", @@ -1041,12 +1044,13 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) goto out; desc->page_index = 0; + desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->duped = 0; status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; nfs_do_filldir(desc, verf); } @@ -1105,9 +1109,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->duped = dir_ctx->duped; page_index = dir_ctx->page_index; desc->attr_gencount = dir_ctx->attr_gencount; + desc->eof = dir_ctx->eof; memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); + if (desc->eof) { + res = 0; + goto out_free; + } + if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && list_is_singular(&nfsi->open_files)) invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); @@ -1141,7 +1151,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); - } while (!desc->eof); + } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; @@ -1149,9 +1159,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->eof = desc->eof; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); - +out_free: kfree(desc); out: @@ -1193,6 +1204,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset == 0) memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; + dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b18f31b2c9e7..f5020828ab65 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8032,7 +8032,8 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, /** * nfs4_proc_get_locations - discover locations for a migrated FSID - * @inode: inode on FSID that is migrating + * @server: pointer to nfs_server to process + * @fhandle: pointer to the kernel NFS client file handle * @locations: result of query * @page: buffer * @cred: credential to use for this operation diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 8f97c2927bee..fdce7a4cfc6f 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -183,11 +183,18 @@ enum HV_GENERIC_SET_FORMAT { #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) #define HV_HYPERCALL_FAST_BIT BIT(16) #define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17) +#define HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27) #define HV_HYPERCALL_REP_COMP_OFFSET 32 #define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) #define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44) #define HV_HYPERCALL_REP_START_OFFSET 48 #define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) +#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60) +#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \ + HV_HYPERCALL_RSVD1_MASK | \ + HV_HYPERCALL_RSVD2_MASK) /* hypercall status code */ #define HV_STATUS_SUCCESS 0 diff --git a/include/linux/ata.h b/include/linux/ata.h index 199e47e97d64..21292b5bbb55 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -324,12 +324,12 @@ enum { ATA_LOG_NCQ_NON_DATA = 0x12, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_IDENTIFY_DEVICE = 0x30, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device log pages: */ ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, - ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 02aa49323d1d..68f81d8d36de 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -107,6 +107,7 @@ struct nfs_open_dir_context { __u64 dup_cookie; pgoff_t page_index; signed char duped; + bool eof; }; /* diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 507ee1f2aa96..a02bbf8fd0f6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -562,9 +562,12 @@ struct kvm_s390_mem_op { __u32 op; /* type of operation */ __u64 buf; /* buffer in userspace */ union { - __u8 ar; /* the access register number */ + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + }; __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* should be set to 0 */ + __u8 reserved[32]; /* ignored */ }; }; /* types for kvm_s390_mem_op->op */ @@ -572,9 +575,12 @@ struct kvm_s390_mem_op { #define KVM_S390_MEMOP_LOGICAL_WRITE 1 #define KVM_S390_MEMOP_SIDA_READ 2 #define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 /* flags for kvm_s390_mem_op->flags */ #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -1135,6 +1141,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 #define KVM_CAP_PPC_AIL_MODE_3 210 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..f2b8baea35d2 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -503,6 +503,7 @@ long __static_call_return0(void) { return 0; } +EXPORT_SYMBOL_GPL(__static_call_return0); #ifdef CONFIG_STATIC_CALL_SELFTEST diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b64a0286b182..05c758da6a92 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -115,11 +115,14 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, } sock = container_of(xprt, struct sock_xprt, xprt); - if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) + mutex_lock(&sock->recv_mutex); + if (sock->sock == NULL || + kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) goto out; ret = sprintf(buf, "%pISc\n", &saddr); out: + mutex_unlock(&sock->recv_mutex); xprt_put(xprt); return ret + 1; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f172d1298013..7b5fce2faa10 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -413,6 +413,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.send_cq)) { rc = PTR_ERR(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; goto out_destroy; } @@ -421,6 +422,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.recv_cq)) { rc = PTR_ERR(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; goto out_destroy; } ep->re_receive_count = 0; @@ -459,6 +461,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_pd = ib_alloc_pd(device, 0); if (IS_ERR(ep->re_pd)) { rc = PTR_ERR(ep->re_pd); + ep->re_pd = NULL; goto out_destroy; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 69b6ee5a5fd1..0f39e08ee580 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1641,7 +1641,12 @@ static int xs_get_srcport(struct sock_xprt *transport) unsigned short get_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); - return xs_sock_getport(sock->sock); + unsigned short ret = 0; + mutex_lock(&sock->recv_mutex); + if (sock->sock) + ret = xs_sock_getport(sock->sock); + mutex_unlock(&sock->recv_mutex); + return ret; } EXPORT_SYMBOL(get_srcport); diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index 23240d793b07..895f4b9ce8c6 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -109,22 +109,25 @@ int asymmetric_verify(struct key *keyring, const char *sig, pk = asymmetric_key_public_key(key); pks.pkey_algo = pk->pkey_algo; - if (!strcmp(pk->pkey_algo, "rsa")) + if (!strcmp(pk->pkey_algo, "rsa")) { pks.encoding = "pkcs1"; - else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) + } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { /* edcsa-nist-p192 etc. */ pks.encoding = "x962"; - else if (!strcmp(pk->pkey_algo, "ecrdsa") || - !strcmp(pk->pkey_algo, "sm2")) + } else if (!strcmp(pk->pkey_algo, "ecrdsa") || + !strcmp(pk->pkey_algo, "sm2")) { pks.encoding = "raw"; - else - return -ENOPKG; + } else { + ret = -ENOPKG; + goto out; + } pks.digest = (u8 *)data; pks.digest_size = datalen; pks.s = hdr->sig; pks.s_size = siglen; ret = verify_signature(key, &pks); +out: key_put(key); pr_debug("%s() = %d\n", __func__, ret); return ret; diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3d8e9d5db5aa..3ad8f7734208 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -496,12 +496,12 @@ int __init ima_fs_init(void) return 0; out: + securityfs_remove(ima_policy); securityfs_remove(violations); securityfs_remove(runtime_measurements_count); securityfs_remove(ascii_runtime_measurements); securityfs_remove(binary_runtime_measurements); securityfs_remove(ima_symlink); securityfs_remove(ima_dir); - securityfs_remove(ima_policy); return -1; } diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 320ca80aacab..2a1f6418b10a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1967,6 +1967,14 @@ int ima_policy_show(struct seq_file *m, void *v) rcu_read_lock(); + /* Do not print rules with inactive LSM labels */ + for (i = 0; i < MAX_LSM_RULES; i++) { + if (entry->lsm[i].args_p && !entry->lsm[i].rule) { + rcu_read_unlock(); + return 0; + } + } + if (entry->action & MEASURE) seq_puts(m, pt(Opt_measure)); if (entry->action & DONT_MEASURE) diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 694560396be0..db1ad6d7a57f 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -29,6 +29,7 @@ static struct ima_template_desc builtin_templates[] = { static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); +static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, @@ -101,10 +102,11 @@ static int __init ima_template_setup(char *str) struct ima_template_desc *template_desc; int template_len = strlen(str); - if (ima_template) + if (template_setup_done) return 1; - ima_init_template_list(); + if (!ima_template) + ima_init_template_list(); /* * Verify that a template with the supplied name exists. @@ -128,6 +130,7 @@ static int __init ima_template_setup(char *str) } ima_template = template_desc; + template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); @@ -136,7 +139,7 @@ static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); - if (ima_template) + if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { @@ -147,6 +150,7 @@ static int __init ima_template_fmt_setup(char *str) builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; + template_setup_done = 1; return 1; } diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 29220056207f..0ec5e4c22cb2 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -45,6 +45,8 @@ void integrity_audit_message(int audit_msgno, struct inode *inode, return; ab = audit_log_start(audit_context(), GFP_KERNEL, audit_msgno); + if (!ab) + return; audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, current_uid()), diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index dce7de7755e6..7903580a48ac 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,6 +8,7 @@ /s390x/memop /s390x/resets /s390x/sync_regs_test +/s390x/tprot /x86_64/amx_test /x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 17c3f0749f05..f9943b96ab3f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -51,6 +51,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test @@ -120,6 +121,7 @@ TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += s390x/tprot TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 1954b964d1cf..101759ac93a4 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -298,12 +298,18 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-i iterations] [-p offset] " + printf("usage: %s [-h] [-i iterations] [-p offset] [-g]" "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); + printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n" + " makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n" + " KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n" + " and writes will be tracked as soon as dirty logging is\n" + " enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n" + " is not enabled).\n"); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); guest_modes_help(); @@ -343,8 +349,11 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { + while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { + case 'g': + dirty_log_manual_caps = 0; + break; case 'i': p.iterations = atoi(optarg); break; diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index c9af97abd622..cc5d14a45702 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -213,6 +213,25 @@ struct hv_enlightened_vmcs { u64 padding64_6[7]; }; +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + #define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -648,381 +667,507 @@ static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) switch (encoding) { case GUEST_RIP: current_evmcs->guest_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_RSP: current_evmcs->guest_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case GUEST_RFLAGS: current_evmcs->guest_rflags = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case HOST_IA32_PAT: current_evmcs->host_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_EFER: current_evmcs->host_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR0: current_evmcs->host_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR3: current_evmcs->host_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR4: current_evmcs->host_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_SYSENTER_ESP: current_evmcs->host_ia32_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_SYSENTER_EIP: current_evmcs->host_ia32_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_RIP: current_evmcs->host_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case IO_BITMAP_A: current_evmcs->io_bitmap_a = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; break; case IO_BITMAP_B: current_evmcs->io_bitmap_b = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; break; case MSR_BITMAP: current_evmcs->msr_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; break; case GUEST_ES_BASE: current_evmcs->guest_es_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_BASE: current_evmcs->guest_cs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_BASE: current_evmcs->guest_ss_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_BASE: current_evmcs->guest_ds_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_BASE: current_evmcs->guest_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_BASE: current_evmcs->guest_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_BASE: current_evmcs->guest_ldtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_BASE: current_evmcs->guest_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GDTR_BASE: current_evmcs->guest_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_IDTR_BASE: current_evmcs->guest_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case TSC_OFFSET: current_evmcs->tsc_offset = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case VIRTUAL_APIC_PAGE_ADDR: current_evmcs->virtual_apic_page_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case VMCS_LINK_POINTER: current_evmcs->vmcs_link_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_DEBUGCTL: current_evmcs->guest_ia32_debugctl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_PAT: current_evmcs->guest_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_EFER: current_evmcs->guest_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR0: current_evmcs->guest_pdptr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR1: current_evmcs->guest_pdptr1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR2: current_evmcs->guest_pdptr2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR3: current_evmcs->guest_pdptr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PENDING_DBG_EXCEPTIONS: current_evmcs->guest_pending_dbg_exceptions = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_ESP: current_evmcs->guest_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_EIP: current_evmcs->guest_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case CR0_GUEST_HOST_MASK: current_evmcs->cr0_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR4_GUEST_HOST_MASK: current_evmcs->cr4_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR0_READ_SHADOW: current_evmcs->cr0_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR4_READ_SHADOW: current_evmcs->cr4_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR0: current_evmcs->guest_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR3: current_evmcs->guest_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR4: current_evmcs->guest_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_DR7: current_evmcs->guest_dr7 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case HOST_FS_BASE: current_evmcs->host_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_GS_BASE: current_evmcs->host_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_TR_BASE: current_evmcs->host_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_GDTR_BASE: current_evmcs->host_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_IDTR_BASE: current_evmcs->host_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_RSP: current_evmcs->host_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case EPT_POINTER: current_evmcs->ept_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; case GUEST_BNDCFGS: current_evmcs->guest_bndcfgs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case XSS_EXIT_BITMAP: current_evmcs->xss_exit_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case GUEST_PHYSICAL_ADDRESS: current_evmcs->guest_physical_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case EXIT_QUALIFICATION: current_evmcs->exit_qualification = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_LINEAR_ADDRESS: current_evmcs->guest_linear_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_MSR_STORE_ADDR: current_evmcs->vm_exit_msr_store_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_LOAD_ADDR: current_evmcs->vm_exit_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_ENTRY_MSR_LOAD_ADDR: current_evmcs->vm_entry_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE0: current_evmcs->cr3_target_value0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE1: current_evmcs->cr3_target_value1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE2: current_evmcs->cr3_target_value2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE3: current_evmcs->cr3_target_value3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case TPR_THRESHOLD: current_evmcs->tpr_threshold = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_INTERRUPTIBILITY_INFO: current_evmcs->guest_interruptibility_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case CPU_BASED_VM_EXEC_CONTROL: current_evmcs->cpu_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC; break; case EXCEPTION_BITMAP: current_evmcs->exception_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN; break; case VM_ENTRY_CONTROLS: current_evmcs->vm_entry_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY; break; case VM_ENTRY_INTR_INFO_FIELD: current_evmcs->vm_entry_intr_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case VM_ENTRY_EXCEPTION_ERROR_CODE: current_evmcs->vm_entry_exception_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case VM_ENTRY_INSTRUCTION_LEN: current_evmcs->vm_entry_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case HOST_IA32_SYSENTER_CS: current_evmcs->host_ia32_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case PIN_BASED_VM_EXEC_CONTROL: current_evmcs->pin_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case VM_EXIT_CONTROLS: current_evmcs->vm_exit_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case SECONDARY_VM_EXEC_CONTROL: current_evmcs->secondary_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case GUEST_ES_LIMIT: current_evmcs->guest_es_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_LIMIT: current_evmcs->guest_cs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_LIMIT: current_evmcs->guest_ss_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_LIMIT: current_evmcs->guest_ds_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_LIMIT: current_evmcs->guest_fs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_LIMIT: current_evmcs->guest_gs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_LIMIT: current_evmcs->guest_ldtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_LIMIT: current_evmcs->guest_tr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GDTR_LIMIT: current_evmcs->guest_gdtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_IDTR_LIMIT: current_evmcs->guest_idtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_ES_AR_BYTES: current_evmcs->guest_es_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_AR_BYTES: current_evmcs->guest_cs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_AR_BYTES: current_evmcs->guest_ss_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_AR_BYTES: current_evmcs->guest_ds_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_AR_BYTES: current_evmcs->guest_fs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_AR_BYTES: current_evmcs->guest_gs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_AR_BYTES: current_evmcs->guest_ldtr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_AR_BYTES: current_evmcs->guest_tr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_ACTIVITY_STATE: current_evmcs->guest_activity_state = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_CS: current_evmcs->guest_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case VM_INSTRUCTION_ERROR: current_evmcs->vm_instruction_error = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_REASON: current_evmcs->vm_exit_reason = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INTR_INFO: current_evmcs->vm_exit_intr_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INTR_ERROR_CODE: current_evmcs->vm_exit_intr_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case IDT_VECTORING_INFO_FIELD: current_evmcs->idt_vectoring_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case IDT_VECTORING_ERROR_CODE: current_evmcs->idt_vectoring_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INSTRUCTION_LEN: current_evmcs->vm_exit_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VMX_INSTRUCTION_INFO: current_evmcs->vmx_instruction_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case PAGE_FAULT_ERROR_CODE_MASK: current_evmcs->page_fault_error_code_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case PAGE_FAULT_ERROR_CODE_MATCH: current_evmcs->page_fault_error_code_match = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_COUNT: current_evmcs->cr3_target_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_STORE_COUNT: current_evmcs->vm_exit_msr_store_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_LOAD_COUNT: current_evmcs->vm_exit_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_ENTRY_MSR_LOAD_COUNT: current_evmcs->vm_entry_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case HOST_ES_SELECTOR: current_evmcs->host_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CS_SELECTOR: current_evmcs->host_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_SS_SELECTOR: current_evmcs->host_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_DS_SELECTOR: current_evmcs->host_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_FS_SELECTOR: current_evmcs->host_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_GS_SELECTOR: current_evmcs->host_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_TR_SELECTOR: current_evmcs->host_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case GUEST_ES_SELECTOR: current_evmcs->guest_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_SELECTOR: current_evmcs->guest_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_SELECTOR: current_evmcs->guest_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_SELECTOR: current_evmcs->guest_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_SELECTOR: current_evmcs->guest_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_SELECTOR: current_evmcs->guest_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_SELECTOR: current_evmcs->guest_ldtr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_SELECTOR: current_evmcs->guest_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case VIRTUAL_PROCESSOR_ID: current_evmcs->virtual_processor_id = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; default: return 1; } @@ -1070,7 +1215,10 @@ static inline int evmcs_vmresume(void) { int ret; - current_evmcs->hv_clean_fields = 0; + /* HOST_RIP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + /* HOST_RSP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; __asm__ __volatile__("push %%rbp;" "push %%rcx;" diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index f4ea2355dbc2..2225e5077350 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -99,7 +99,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. + */ + u8 reserved_sw[32]; }; diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index 587fbe408b99..a25aabd8f5e7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -16,6 +16,7 @@ #define CPUID_SVM_BIT 2 #define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) +#define SVM_EXIT_MSR 0x07c #define SVM_EXIT_VMMCALL 0x081 struct svm_test_data { @@ -28,6 +29,11 @@ struct svm_test_data { struct vmcb_save_area *save_area; /* gva */ void *save_area_hva; uint64_t save_area_gpa; + + /* MSR-Bitmap */ + void *msr; /* gva */ + void *msr_hva; + uint64_t msr_gpa; }; struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 0ebc03ce079c..736ee4a23df6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -43,6 +43,11 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + svm->msr = (void *)vm_vaddr_alloc_page(vm); + svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); + svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); + memset(svm->msr_hva, 0, getpagesize()); + *p_svm_gva = svm_gva; return svm; } @@ -106,6 +111,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL); + ctrl->msrpm_base_pa = svm->msr_gpa; vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 9f49ead380ab..d19c3ffdea3f 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -160,6 +160,21 @@ int main(int argc, char *argv[]) run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ vcpu_run(vm, VCPU_ID); /* Run to sync new state */ + /* Check that the SIDA calls are rejected for non-protected guests */ + ksmo.gaddr = 0; + ksmo.flags = 0; + ksmo.size = 8; + ksmo.op = KVM_S390_MEMOP_SIDA_READ; + ksmo.buf = (uintptr_t)mem1; + ksmo.sida_offset = 0x1c0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_READ in non-protected mode"); + ksmo.op = KVM_S390_MEMOP_SIDA_WRITE; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_WRITE in non-protected mode"); + kvm_vm_free(vm); return 0; diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c new file mode 100644 index 000000000000..c097b9db495e --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test TEST PROTECTION emulation. + * + * Copyright IBM Corp. 2021 + */ + +#include <sys/mman.h> +#include "test_util.h" +#include "kvm_util.h" + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) +#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) + +#define VCPU_ID 1 + +static __aligned(PAGE_SIZE) uint8_t pages[2][PAGE_SIZE]; +static uint8_t *const page_store_prot = pages[0]; +static uint8_t *const page_fetch_prot = pages[1]; + +/* Nonzero return value indicates that address not mapped */ +static int set_storage_key(void *addr, uint8_t key) +{ + int not_mapped = 0; + + asm volatile ( + "lra %[addr], 0(0,%[addr])\n" + " jz 0f\n" + " llill %[not_mapped],1\n" + " j 1f\n" + "0: sske %[key], %[addr]\n" + "1:" + : [addr] "+&a" (addr), [not_mapped] "+r" (not_mapped) + : [key] "r" (key) + : "cc" + ); + return -not_mapped; +} + +enum permission { + READ_WRITE = 0, + READ = 1, + RW_PROTECTED = 2, + TRANSL_UNAVAIL = 3, +}; + +static enum permission test_protection(void *addr, uint8_t key) +{ + uint64_t mask; + + asm volatile ( + "tprot %[addr], 0(%[key])\n" + " ipm %[mask]\n" + : [mask] "=r" (mask) + : [addr] "Q" (*(char *)addr), + [key] "a" (key) + : "cc" + ); + + return (enum permission)(mask >> 28); +} + +enum stage { + STAGE_END, + STAGE_INIT_SIMPLE, + TEST_SIMPLE, + STAGE_INIT_FETCH_PROT_OVERRIDE, + TEST_FETCH_PROT_OVERRIDE, + TEST_STORAGE_PROT_OVERRIDE, +}; + +struct test { + enum stage stage; + void *addr; + uint8_t key; + enum permission expected; +} tests[] = { + /* + * We perform each test in the array by executing TEST PROTECTION on + * the specified addr with the specified key and checking if the returned + * permissions match the expected value. + * Both guest and host cooperate to set up the required test conditions. + * A central condition is that the page targeted by addr has to be DAT + * protected in the host mappings, in order for KVM to emulate the + * TEST PROTECTION instruction. + * Since the page tables are shared, the host uses mprotect to achieve + * this. + * + * Test resulting in RW_PROTECTED/TRANSL_UNAVAIL will be interpreted + * by SIE, not KVM, but there is no harm in testing them also. + * See Enhanced Suppression-on-Protection Facilities in the + * Interpretive-Execution Mode + */ + /* + * guest: set storage key of page_store_prot to 1 + * storage key of page_fetch_prot to 9 and enable + * protection for it + * STAGE_INIT_SIMPLE + * host: write protect both via mprotect + */ + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x10, READ_WRITE }, + /* mismatched keys, but no fetch protection -> RO */ + { TEST_SIMPLE, page_store_prot, 0x20, READ }, + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x90, READ_WRITE }, + /* mismatched keys, fetch protection -> inaccessible */ + { TEST_SIMPLE, page_fetch_prot, 0x10, RW_PROTECTED }, + /* page 0 not mapped yet -> translation not available */ + { TEST_SIMPLE, (void *)0x00, 0x10, TRANSL_UNAVAIL }, + /* + * host: try to map page 0 + * guest: set storage key of page 0 to 9 and enable fetch protection + * STAGE_INIT_FETCH_PROT_OVERRIDE + * host: write protect page 0 + * enable fetch protection override + */ + /* mismatched keys, fetch protection, but override applies -> RO */ + { TEST_FETCH_PROT_OVERRIDE, (void *)0x00, 0x10, READ }, + /* mismatched keys, fetch protection, override applies to 0-2048 only -> inaccessible */ + { TEST_FETCH_PROT_OVERRIDE, (void *)2049, 0x10, RW_PROTECTED }, + /* + * host: enable storage protection override + */ + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, page_fetch_prot, 0x10, READ_WRITE }, + /* mismatched keys, no fetch protection, override doesn't apply -> RO */ + { TEST_STORAGE_PROT_OVERRIDE, page_store_prot, 0x20, READ }, + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, (void *)2049, 0x10, READ_WRITE }, + /* end marker */ + { STAGE_END, 0, 0, 0 }, +}; + +static enum stage perform_next_stage(int *i, bool mapped_0) +{ + enum stage stage = tests[*i].stage; + enum permission result; + bool skip; + + for (; tests[*i].stage == stage; (*i)++) { + /* + * Some fetch protection override tests require that page 0 + * be mapped, however, when the hosts tries to map that page via + * vm_vaddr_alloc, it may happen that some other page gets mapped + * instead. + * In order to skip these tests we detect this inside the guest + */ + skip = tests[*i].addr < (void *)4096 && + tests[*i].expected != TRANSL_UNAVAIL && + !mapped_0; + if (!skip) { + result = test_protection(tests[*i].addr, tests[*i].key); + GUEST_ASSERT_2(result == tests[*i].expected, *i, result); + } + } + return stage; +} + +static void guest_code(void) +{ + bool mapped_0; + int i = 0; + + GUEST_ASSERT_EQ(set_storage_key(page_store_prot, 0x10), 0); + GUEST_ASSERT_EQ(set_storage_key(page_fetch_prot, 0x98), 0); + GUEST_SYNC(STAGE_INIT_SIMPLE); + GUEST_SYNC(perform_next_stage(&i, false)); + + /* Fetch-protection override */ + mapped_0 = !set_storage_key((void *)0, 0x98); + GUEST_SYNC(STAGE_INIT_FETCH_PROT_OVERRIDE); + GUEST_SYNC(perform_next_stage(&i, mapped_0)); + + /* Storage-protection override */ + GUEST_SYNC(perform_next_stage(&i, mapped_0)); +} + +#define HOST_SYNC(vmp, stage) \ +({ \ + struct kvm_vm *__vm = (vmp); \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vm, VCPU_ID); \ + get_ucall(__vm, VCPU_ID, &uc); \ + if (uc.cmd == UCALL_ABORT) { \ + TEST_FAIL("line %lu: %s, hints: %lu, %lu", uc.args[1], \ + (const char *)uc.args[0], uc.args[2], uc.args[3]); \ + } \ + ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + ASSERT_EQ(uc.args[1], __stage); \ +}) + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *run; + vm_vaddr_t guest_0_page; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + run = vcpu_state(vm, VCPU_ID); + + HOST_SYNC(vm, STAGE_INIT_SIMPLE); + mprotect(addr_gva2hva(vm, (vm_vaddr_t)pages), PAGE_SIZE * 2, PROT_READ); + HOST_SYNC(vm, TEST_SIMPLE); + + guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); + if (guest_0_page != 0) + print_skip("Did not allocate page at 0 for fetch protection override tests"); + HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + if (guest_0_page == 0) + mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); + run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vm, TEST_FETCH_PROT_OVERRIDE); + + run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vm, TEST_STORAGE_PROT_OVERRIDE); +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 4c7841dfd481..d12e043aa2ee 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -10,6 +10,7 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <linux/bitmap.h> #include "test_util.h" @@ -32,6 +33,22 @@ static void guest_nmi_handler(struct ex_regs *regs) { } +/* Exits to L1 destroy GRPs! */ +static inline void rdmsr_fs_base(void) +{ + __asm__ __volatile__ ("mov $0xc0000100, %%rcx; rdmsr" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} +static inline void rdmsr_gs_base(void) +{ + __asm__ __volatile__ ("mov $0xc0000101, %%rcx; rdmsr" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} + void l2_guest_code(void) { GUEST_SYNC(7); @@ -41,6 +58,15 @@ void l2_guest_code(void) /* Forced exit to L1 upon restore */ GUEST_SYNC(9); + vmcall(); + + /* MSR-Bitmap tests */ + rdmsr_fs_base(); /* intercepted */ + rdmsr_fs_base(); /* intercepted */ + rdmsr_gs_base(); /* not intercepted */ + vmcall(); + rdmsr_gs_base(); /* intercepted */ + /* Done, exit to L1 and never come back. */ vmcall(); } @@ -76,8 +102,9 @@ void guest_code(struct vmx_pages *vmx_pages) current_evmcs->revision_id = EVMCS_VERSION; GUEST_SYNC(6); - current_evmcs->pin_based_vm_exec_control |= - PIN_BASED_NMI_EXITING; + vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_NMI_EXITING); + GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); @@ -91,6 +118,39 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(10); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Intercept RDMSR 0xc0000100 */ + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_USE_MSR_BITMAPS); + set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + current_evmcs->hv_enlightenments_control.msr_bitmap = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); GUEST_SYNC(11); /* Try enlightened vmptrld with an incorrect GPA */ diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 7e2d2d17d2ed..8c245ab2d98a 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -49,16 +49,13 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, bool evmcs_expected) { int i; - int nent = 9; + int nent_expected = 10; u32 test_val; - if (evmcs_expected) - nent += 1; /* 0x4000000A */ - - TEST_ASSERT(hv_cpuid_entries->nent == nent, + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" - " with evmcs=%d (returned %d)", - nent, evmcs_expected, hv_cpuid_entries->nent); + " (returned %d)", + nent_expected, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; @@ -68,9 +65,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, "function %x is our of supported range", entry->function); - TEST_ASSERT(evmcs_expected || (entry->function != 0x4000000A), - "0x4000000A leaf should not be reported"); - TEST_ASSERT(entry->index == 0, ".index field should be zero"); @@ -97,8 +91,20 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); break; - } + case 0x4000000A: + TEST_ASSERT(entry->eax & (1UL << 19), + "Enlightened MSR-Bitmap should always be supported" + " 0x40000000.EAX: %x", entry->eax); + if (evmcs_expected) + TEST_ASSERT((entry->eax & 0xffff) == 0x101, + "Supported Enlightened VMCS version range is supposed to be 1:1" + " 0x40000000.EAX: %x", entry->eax); + + break; + default: + break; + } /* * If needed for debug: * fprintf(stdout, @@ -107,7 +113,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } - } void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c new file mode 100644 index 000000000000..21f5ca9197da --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_GET/SET_* tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + * Tests for Hyper-V extensions to SVM. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/bitmap.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "hyperv.h" + +#define VCPU_ID 1 +#define L2_GUEST_STACK_SIZE 256 + +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31) + +static inline void vmmcall(void) +{ + __asm__ __volatile__("vmmcall"); +} + +void l2_guest_code(void) +{ + GUEST_SYNC(3); + /* Exit to L1 */ + vmmcall(); + + /* MSR-Bitmap tests */ + rdmsr(MSR_FS_BASE); /* intercepted */ + rdmsr(MSR_FS_BASE); /* intercepted */ + rdmsr(MSR_GS_BASE); /* not intercepted */ + vmmcall(); + rdmsr(MSR_GS_BASE); /* intercepted */ + + GUEST_SYNC(5); + + /* Done, exit to L1 and never come back. */ + vmmcall(); +} + +static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + GUEST_SYNC(1); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(2); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(4); + vmcb->save.rip += 3; + + /* Intercept RDMSR 0xc0000100 */ + vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; + set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + hve->hv_enlightenments_control.msr_bitmap = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + vmcb->control.clean |= VMCB_HV_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + vmcb->save.rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + vmcb->control.clean &= ~VMCB_HV_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(6); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + int stage; + + if (!nested_svm_supported()) { + print_skip("Nested SVM not supported"); + exit(KSFT_SKIP); + } + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_hv_cpuid(vm, VCPU_ID); + run = vcpu_state(vm, VCPU_ID); + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 80056bbbb003..d1dc1acf997c 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -21,6 +21,8 @@ #define NR_LOCK_TESTING_THREADS 3 #define NR_LOCK_TESTING_ITERATIONS 10000 +bool have_sev_es; + static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error) { struct kvm_sev_cmd cmd = { @@ -172,10 +174,18 @@ static void test_sev_migrate_parameters(void) *sev_es_vm_no_vmsa; int ret; - sev_vm = sev_vm_create(/* es= */ false); - sev_es_vm = sev_vm_create(/* es= */ true); vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); vm_no_sev = aux_vm_create(true); + ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Migrations require SEV enabled. ret %d, errno: %d\n", ret, + errno); + + if (!have_sev_es) + goto out; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); @@ -204,14 +214,10 @@ static void test_sev_migrate_parameters(void) "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n", ret, errno); - ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Migrations require SEV enabled. ret %d, errno: %d\n", ret, - errno); - kvm_vm_free(sev_vm); kvm_vm_free(sev_es_vm); kvm_vm_free(sev_es_vm_no_vmsa); +out: kvm_vm_free(vm_no_vcpu); kvm_vm_free(vm_no_sev); } @@ -300,7 +306,6 @@ static void test_sev_mirror_parameters(void) int ret; sev_vm = sev_vm_create(/* es= */ false); - sev_es_vm = sev_vm_create(/* es= */ true); vm_with_vcpu = aux_vm_create(true); vm_no_vcpu = aux_vm_create(false); @@ -310,6 +315,21 @@ static void test_sev_mirror_parameters(void) "Should not be able copy context to self. ret: %d, errno: %d\n", ret, errno); + ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, + errno); + + ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", + ret, errno); + + if (!have_sev_es) + goto out; + + sev_es_vm = sev_vm_create(/* es= */ true); ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd); TEST_ASSERT( ret == -1 && errno == EINVAL, @@ -322,63 +342,97 @@ static void test_sev_mirror_parameters(void) "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n", ret, errno); - ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, - errno); - - ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", - ret, errno); + kvm_vm_free(sev_es_vm); +out: kvm_vm_free(sev_vm); - kvm_vm_free(sev_es_vm); kvm_vm_free(vm_with_vcpu); kvm_vm_free(vm_no_vcpu); } static void test_sev_move_copy(void) { - struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm; - int ret; + struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm, + *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm; sev_vm = sev_vm_create(/* es= */ false); dst_vm = aux_vm_create(true); + dst2_vm = aux_vm_create(true); + dst3_vm = aux_vm_create(true); mirror_vm = aux_vm_create(false); dst_mirror_vm = aux_vm_create(false); + dst2_mirror_vm = aux_vm_create(false); + dst3_mirror_vm = aux_vm_create(false); sev_mirror_create(mirror_vm->fd, sev_vm->fd); - ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); - TEST_ASSERT(ret == -1 && errno == EBUSY, - "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, - errno); - /* The mirror itself can be migrated. */ sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); - ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); - TEST_ASSERT(ret == -1 && errno == EBUSY, - "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, - errno); + sev_migrate_from(dst_vm->fd, sev_vm->fd); + + sev_migrate_from(dst2_vm->fd, dst_vm->fd); + sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd); + + sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd); + sev_migrate_from(dst3_vm->fd, dst2_vm->fd); + + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); + kvm_vm_free(dst2_vm); + kvm_vm_free(dst3_vm); + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst2_mirror_vm); + kvm_vm_free(dst3_mirror_vm); /* - * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus, - * the owner can be copied as soon as dst_mirror_vm is gone. + * Run similar test be destroy mirrors before mirrored VMs to ensure + * destruction is done safely. */ - kvm_vm_free(dst_mirror_vm); + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm->fd, sev_vm->fd); + + sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); sev_migrate_from(dst_vm->fd, sev_vm->fd); kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); kvm_vm_free(dst_vm); kvm_vm_free(sev_vm); } +#define X86_FEATURE_SEV (1 << 1) +#define X86_FEATURE_SEV_ES (1 << 3) + int main(int argc, char *argv[]) { + struct kvm_cpuid_entry2 *cpuid; + + if (!kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) && + !kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + print_skip("Capabilities not available"); + exit(KSFT_SKIP); + } + + cpuid = kvm_get_supported_cpuid_entry(0x80000000); + if (cpuid->eax < 0x8000001f) { + print_skip("AMD memory encryption not available"); + exit(KSFT_SKIP); + } + cpuid = kvm_get_supported_cpuid_entry(0x8000001f); + if (!(cpuid->eax & X86_FEATURE_SEV)) { + print_skip("AMD SEV not available"); + exit(KSFT_SKIP); + } + have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES); + if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { test_sev_migrate_from(/* es= */ false); - test_sev_migrate_from(/* es= */ true); + if (have_sev_es) + test_sev_migrate_from(/* es= */ true); test_sev_migrate_locking(); test_sev_migrate_parameters(); if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) @@ -386,7 +440,8 @@ int main(int argc, char *argv[]) } if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { test_sev_mirror(/* es= */ false); - test_sev_mirror(/* es= */ true); + if (have_sev_es) + test_sev_mirror(/* es= */ true); test_sev_mirror_parameters(); } return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 58d31da8a2f7..83c57bcc6eb6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -246,9 +246,8 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) return true; } -static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int req, struct cpumask *tmp, - int current_cpu) +static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, + struct cpumask *tmp, int current_cpu) { int cpu; @@ -291,7 +290,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, vcpu = kvm_get_vcpu(kvm, i); if (!vcpu) continue; - kvm_make_vcpu_request(kvm, vcpu, req, cpus, me); + kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); @@ -317,7 +316,7 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu == except) continue; - kvm_make_vcpu_request(kvm, vcpu, req, cpus, me); + kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); |