diff options
Diffstat (limited to 'tools/testing/selftests/kvm')
58 files changed, 5263 insertions, 768 deletions
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index bd83158e0e0b..0709af0144c8 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/debug-exceptions /aarch64/get-reg-list -/aarch64/get-reg-list-sve /aarch64/vgic_init /s390x/memop /s390x/resets @@ -8,12 +8,15 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/emulator_error_test /x86_64/get_cpuid_test /x86_64/get_msr_index_features /x86_64/kvm_pv_test /x86_64/hyperv_clock /x86_64/hyperv_cpuid +/x86_64/hyperv_features /x86_64/mmio_warning_test +/x86_64/mmu_role_test /x86_64/platform_info_test /x86_64/set_boot_cpu_id /x86_64/set_sregs_test @@ -29,11 +32,13 @@ /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test +/x86_64/vmx_nested_tsc_scaling_test /x86_64/xapic_ipi_test /x86_64/xen_shinfo_test /x86_64/xen_vmcall_test /x86_64/xss_msr_test /x86_64/vmx_pmu_msrs_test +/access_tracking_perf_test /demand_paging_test /dirty_log_test /dirty_log_perf_test @@ -41,5 +46,7 @@ /kvm_create_max_vcpus /kvm_page_table_test /memslot_modification_stress_test +/memslot_perf_test /set_memory_region_test /steal_time +/kvm_binary_stats_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index e439d027939d..5832f510a16c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,19 +33,22 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S -LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c +LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S +LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test +TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test @@ -60,6 +63,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs @@ -67,6 +71,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test @@ -74,11 +79,13 @@ TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test +TEST_GEN_PROGS_x86_64 += memslot_perf_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test +TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list -TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test @@ -87,6 +94,7 @@ TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += kvm_page_table_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets @@ -96,6 +104,7 @@ TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus TEST_GEN_PROGS_s390x += kvm_page_table_test TEST_GEN_PROGS_s390x += set_memory_region_test +TEST_GEN_PROGS_s390x += kvm_binary_stats_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c new file mode 100644 index 000000000000..e5e6c92b60da --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +#define MDSCR_KDE (1 << 13) +#define MDSCR_MDE (1 << 15) +#define MDSCR_SS (1 << 0) + +#define DBGBCR_LEN8 (0xff << 5) +#define DBGBCR_EXEC (0x0 << 3) +#define DBGBCR_EL1 (0x1 << 1) +#define DBGBCR_E (0x1 << 0) + +#define DBGWCR_LEN8 (0xff << 5) +#define DBGWCR_RD (0x1 << 3) +#define DBGWCR_WR (0x2 << 3) +#define DBGWCR_EL1 (0x1 << 1) +#define DBGWCR_E (0x1 << 0) + +#define SPSR_D (1 << 9) +#define SPSR_SS (1 << 21) + +extern unsigned char sw_bp, hw_bp, bp_svc, bp_brk, hw_wp, ss_start; +static volatile uint64_t sw_bp_addr, hw_bp_addr; +static volatile uint64_t wp_addr, wp_data_addr; +static volatile uint64_t svc_addr; +static volatile uint64_t ss_addr[4], ss_idx; +#define PC(v) ((uint64_t)&(v)) + +static void reset_debug_state(void) +{ + asm volatile("msr daifset, #8"); + + write_sysreg(osdlr_el1, 0); + write_sysreg(oslar_el1, 0); + isb(); + + write_sysreg(mdscr_el1, 0); + /* This test only uses the first bp and wp slot. */ + write_sysreg(dbgbvr0_el1, 0); + write_sysreg(dbgbcr0_el1, 0); + write_sysreg(dbgwcr0_el1, 0); + write_sysreg(dbgwvr0_el1, 0); + isb(); +} + +static void install_wp(uint64_t addr) +{ + uint32_t wcr; + uint32_t mdscr; + + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; + write_sysreg(dbgwcr0_el1, wcr); + write_sysreg(dbgwvr0_el1, addr); + isb(); + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static void install_hw_bp(uint64_t addr) +{ + uint32_t bcr; + uint32_t mdscr; + + bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; + write_sysreg(dbgbcr0_el1, bcr); + write_sysreg(dbgbvr0_el1, addr); + isb(); + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static void install_ss(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_SS; + write_sysreg(mdscr_el1, mdscr); + isb(); +} + +static volatile char write_data; + +static void guest_code(void) +{ + GUEST_SYNC(0); + + /* Software-breakpoint */ + asm volatile("sw_bp: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); + + GUEST_SYNC(1); + + /* Hardware-breakpoint */ + reset_debug_state(); + install_hw_bp(PC(hw_bp)); + asm volatile("hw_bp: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); + + GUEST_SYNC(2); + + /* Hardware-breakpoint + svc */ + reset_debug_state(); + install_hw_bp(PC(bp_svc)); + asm volatile("bp_svc: svc #0"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); + GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); + + GUEST_SYNC(3); + + /* Hardware-breakpoint + software-breakpoint */ + reset_debug_state(); + install_hw_bp(PC(bp_brk)); + asm volatile("bp_brk: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); + + GUEST_SYNC(4); + + /* Watchpoint */ + reset_debug_state(); + install_wp(PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + GUEST_SYNC(5); + + /* Single-step */ + reset_debug_state(); + install_ss(); + ss_idx = 0; + asm volatile("ss_start:\n" + "mrs x0, esr_el1\n" + "add x0, x0, #1\n" + "msr daifset, #8\n" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], PC(ss_start)); + GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); + GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); + + GUEST_DONE(); +} + +static void guest_sw_bp_handler(struct ex_regs *regs) +{ + sw_bp_addr = regs->pc; + regs->pc += 4; +} + +static void guest_hw_bp_handler(struct ex_regs *regs) +{ + hw_bp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_wp_handler(struct ex_regs *regs) +{ + wp_data_addr = read_sysreg(far_el1); + wp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_ss_handler(struct ex_regs *regs) +{ + GUEST_ASSERT_1(ss_idx < 4, ss_idx); + ss_addr[ss_idx++] = regs->pc; + regs->pstate |= SPSR_SS; +} + +static void guest_svc_handler(struct ex_regs *regs) +{ + svc_addr = regs->pc; +} + +static int debug_version(struct kvm_vm *vm) +{ + uint64_t id_aa64dfr0; + + get_reg(vm, VCPU_ID, ARM64_SYS_REG(ID_AA64DFR0_EL1), &id_aa64dfr0); + return id_aa64dfr0 & 0xf; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct ucall uc; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + ucall_init(vm, NULL); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + if (debug_version(vm) < 6) { + print_skip("Armv8 debug architecture not supported."); + kvm_vm_free(vm); + exit(KSFT_SKIP); + } + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_BRK_INS, guest_sw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_HW_BP_CURRENT, guest_hw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_WP_CURRENT, guest_wp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SSTEP_CURRENT, guest_ss_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SVC64, guest_svc_handler); + + for (stage = 0; stage < 7; stage++) { + vcpu_run(vm, VCPU_ID); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Stage %d: Unexpected sync ucall, got %lx", + stage, (ulong)uc.args[1]); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c deleted file mode 100644 index efba76682b4b..000000000000 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c +++ /dev/null @@ -1,3 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define REG_LIST_SVE -#include "get-reg-list.c" diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 486932164cf2..cc898181faab 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -27,17 +27,37 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> #include "kvm_util.h" #include "test_util.h" #include "processor.h" -#ifdef REG_LIST_SVE -#define reg_list_sve() (true) -#else -#define reg_list_sve() (false) -#endif +static struct kvm_reg_list *reg_list; +static __u64 *blessed_reg, blessed_n; -#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) +struct reg_sublist { + const char *name; + long capability; + int feature; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; +}; + +struct vcpu_config { + char *name; + struct reg_sublist sublists[]; +}; + +static struct vcpu_config *vcpu_configs[]; +static int vcpu_configs_n; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) #define for_each_reg(i) \ for ((i) = 0; (i) < reg_list->n; ++(i)) @@ -54,12 +74,41 @@ for_each_reg_filtered(i) \ if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) +static const char *config_name(struct vcpu_config *c) +{ + struct reg_sublist *s; + int len = 0; -static struct kvm_reg_list *reg_list; + if (c->name) + return c->name; -static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; -static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; -static __u64 *blessed_reg, blessed_n; + for_each_sublist(c, s) + len += strlen(s->name) + 1; + + c->name = malloc(len); + + len = 0; + for_each_sublist(c, s) { + if (!strcmp(s->name, "base")) + continue; + strcat(c->name + len, s->name); + len += strlen(s->name) + 1; + c->name[len - 1] = '+'; + } + c->name[len - 1] = '\0'; + + return c->name; +} + +static bool has_cap(struct vcpu_config *c, long capability) +{ + struct reg_sublist *s; + + for_each_sublist(c, s) + if (s->capability == capability) + return true; + return false; +} static bool filter_reg(__u64 reg) { @@ -96,11 +145,13 @@ static const char *str_with_index(const char *template, __u64 index) return (const char *)str; } +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + #define CORE_REGS_XX_NR_WORDS 2 #define CORE_SPSR_XX_NR_WORDS 2 #define CORE_FPREGS_XX_NR_WORDS 4 -static const char *core_id_to_str(__u64 id) +static const char *core_id_to_str(struct vcpu_config *c, __u64 id) { __u64 core_off = id & ~REG_MASK, idx; @@ -111,7 +162,7 @@ static const char *core_id_to_str(__u64 id) case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; - TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx); + TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx); case KVM_REG_ARM_CORE_REG(regs.sp): return "KVM_REG_ARM_CORE_REG(regs.sp)"; @@ -126,12 +177,12 @@ static const char *core_id_to_str(__u64 id) case KVM_REG_ARM_CORE_REG(spsr[0]) ... KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; - TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx); + TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx); case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; - TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx); + TEST_ASSERT(idx < 32, "%s: Unexpected fp_regs.vregs index: %lld", config_name(c), idx); return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx); case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; @@ -139,11 +190,11 @@ static const char *core_id_to_str(__u64 id) return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; } - TEST_FAIL("Unknown core reg id: 0x%llx", id); + TEST_FAIL("%s: Unknown core reg id: 0x%llx", config_name(c), id); return NULL; } -static const char *sve_id_to_str(__u64 id) +static const char *sve_id_to_str(struct vcpu_config *c, __u64 id) { __u64 sve_off, n, i; @@ -153,37 +204,37 @@ static const char *sve_id_to_str(__u64 id) sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); - TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id); + TEST_ASSERT(i == 0, "%s: Currently we don't expect slice > 0, reg id 0x%llx", config_name(c), id); switch (sve_off) { case KVM_REG_ARM64_SVE_ZREG_BASE ... KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), - "Unexpected bits set in SVE ZREG id: 0x%llx", id); + "%s: Unexpected bits set in SVE ZREG id: 0x%llx", config_name(c), id); return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n); case KVM_REG_ARM64_SVE_PREG_BASE ... KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), - "Unexpected bits set in SVE PREG id: 0x%llx", id); + "%s: Unexpected bits set in SVE PREG id: 0x%llx", config_name(c), id); return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n); case KVM_REG_ARM64_SVE_FFR_BASE: TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), - "Unexpected bits set in SVE FFR id: 0x%llx", id); + "%s: Unexpected bits set in SVE FFR id: 0x%llx", config_name(c), id); return "KVM_REG_ARM64_SVE_FFR(0)"; } return NULL; } -static void print_reg(__u64 id) +static void print_reg(struct vcpu_config *c, __u64 id) { unsigned op0, op1, crn, crm, op2; const char *reg_size = NULL; TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, - "KVM_REG_ARM64 missing in reg id: 0x%llx", id); + "%s: KVM_REG_ARM64 missing in reg id: 0x%llx", config_name(c), id); switch (id & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U8: @@ -214,17 +265,17 @@ static void print_reg(__u64 id) reg_size = "KVM_REG_SIZE_U2048"; break; default: - TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx", - (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + TEST_FAIL("%s: Unexpected reg size: 0x%llx in reg id: 0x%llx", + config_name(c), (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); } switch (id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: - printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id)); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(c, id)); break; case KVM_REG_ARM_DEMUX: TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), - "Unexpected bits set in DEMUX reg id: 0x%llx", id); + "%s: Unexpected bits set in DEMUX reg id: 0x%llx", config_name(c), id); printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); break; @@ -235,23 +286,23 @@ static void print_reg(__u64 id) crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), - "Unexpected bits set in SYSREG reg id: 0x%llx", id); + "%s: Unexpected bits set in SYSREG reg id: 0x%llx", config_name(c), id); printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); break; case KVM_REG_ARM_FW: TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), - "Unexpected bits set in FW reg id: 0x%llx", id); + "%s: Unexpected bits set in FW reg id: 0x%llx", config_name(c), id); printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); break; case KVM_REG_ARM64_SVE: - if (reg_list_sve()) - printf("\t%s,\n", sve_id_to_str(id)); + if (has_cap(c, KVM_CAP_ARM_SVE)) + printf("\t%s,\n", sve_id_to_str(c, id)); else - TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + TEST_FAIL("%s: KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", config_name(c), id); break; default: - TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", - (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + TEST_FAIL("%s: Unexpected coproc type: 0x%llx in reg id: 0x%llx", + config_name(c), (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); } } @@ -312,56 +363,58 @@ static void core_reg_fixup(void) reg_list = tmp; } -static void prepare_vcpu_init(struct kvm_vcpu_init *init) +static void prepare_vcpu_init(struct vcpu_config *c, struct kvm_vcpu_init *init) { - if (reg_list_sve()) - init->features[0] |= 1 << KVM_ARM_VCPU_SVE; + struct reg_sublist *s; + + for_each_sublist(c, s) + if (s->capability) + init->features[s->feature / 32] |= 1 << (s->feature % 32); } -static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid, struct vcpu_config *c) { + struct reg_sublist *s; int feature; - if (reg_list_sve()) { - feature = KVM_ARM_VCPU_SVE; - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + for_each_sublist(c, s) { + if (s->finalize) { + feature = s->feature; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + } } } -static void check_supported(void) +static void check_supported(struct vcpu_config *c) { - if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) { - fprintf(stderr, "SVE not available, skipping tests\n"); - exit(KSFT_SKIP); + struct reg_sublist *s; + + for_each_sublist(c, s) { + if (s->capability && !kvm_check_cap(s->capability)) { + fprintf(stderr, "%s: %s not available, skipping tests\n", config_name(c), s->name); + exit(KSFT_SKIP); + } } } -int main(int ac, char **av) +static bool print_list; +static bool print_filtered; +static bool fixup_core_regs; + +static void run_test(struct vcpu_config *c) { struct kvm_vcpu_init init = { .target = -1, }; - int new_regs = 0, missing_regs = 0, i; + int new_regs = 0, missing_regs = 0, i, n; int failed_get = 0, failed_set = 0, failed_reject = 0; - bool print_list = false, print_filtered = false, fixup_core_regs = false; struct kvm_vm *vm; - __u64 *vec_regs; + struct reg_sublist *s; - check_supported(); - - for (i = 1; i < ac; ++i) { - if (strcmp(av[i], "--core-reg-fixup") == 0) - fixup_core_regs = true; - else if (strcmp(av[i], "--list") == 0) - print_list = true; - else if (strcmp(av[i], "--list-filtered") == 0) - print_filtered = true; - else - TEST_FAIL("Unknown option: %s\n", av[i]); - } + check_supported(c); vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); - prepare_vcpu_init(&init); + prepare_vcpu_init(c, &init); aarch64_vcpu_add_default(vm, 0, &init, NULL); - finalize_vcpu(vm, 0); + finalize_vcpu(vm, 0, c); reg_list = vcpu_get_reg_list(vm, 0); @@ -374,10 +427,10 @@ int main(int ac, char **av) __u64 id = reg_list->reg[i]; if ((print_list && !filter_reg(id)) || (print_filtered && filter_reg(id))) - print_reg(id); + print_reg(c, id); } putchar('\n'); - return 0; + return; } /* @@ -396,50 +449,52 @@ int main(int ac, char **av) .id = reg_list->reg[i], .addr = (__u64)&addr, }; + bool reject_reg = false; int ret; ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); if (ret) { - puts("Failed to get "); - print_reg(reg.id); + printf("%s: Failed to get ", config_name(c)); + print_reg(c, reg.id); putchar('\n'); ++failed_get; } /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */ - if (find_reg(rejects_set, rejects_set_n, reg.id)) { - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); - if (ret != -1 || errno != EPERM) { - printf("Failed to reject (ret=%d, errno=%d) ", ret, errno); - print_reg(reg.id); - putchar('\n'); - ++failed_reject; + for_each_sublist(c, s) { + if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { + reject_reg = true; + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret != -1 || errno != EPERM) { + printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); + print_reg(c, reg.id); + putchar('\n'); + ++failed_reject; + } + break; } - continue; } - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); - if (ret) { - puts("Failed to set "); - print_reg(reg.id); - putchar('\n'); - ++failed_set; + if (!reject_reg) { + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret) { + printf("%s: Failed to set ", config_name(c)); + print_reg(c, reg.id); + putchar('\n'); + ++failed_set; + } } } - if (reg_list_sve()) { - blessed_n = base_regs_n + sve_regs_n; - vec_regs = sve_regs; - } else { - blessed_n = base_regs_n + vregs_n; - vec_regs = vregs; - } - + for_each_sublist(c, s) + blessed_n += s->regs_n; blessed_reg = calloc(blessed_n, sizeof(__u64)); - for (i = 0; i < base_regs_n; ++i) - blessed_reg[i] = base_regs[i]; - for (i = 0; i < blessed_n - base_regs_n; ++i) - blessed_reg[base_regs_n + i] = vec_regs[i]; + + n = 0; + for_each_sublist(c, s) { + for (i = 0; i < s->regs_n; ++i) + blessed_reg[n++] = s->regs[i]; + } for_each_new_reg(i) ++new_regs; @@ -448,40 +503,141 @@ int main(int ac, char **av) ++missing_regs; if (new_regs || missing_regs) { - printf("Number blessed registers: %5lld\n", blessed_n); - printf("Number registers: %5lld\n", reg_list->n); + printf("%s: Number blessed registers: %5lld\n", config_name(c), blessed_n); + printf("%s: Number registers: %5lld\n", config_name(c), reg_list->n); } if (new_regs) { - printf("\nThere are %d new registers.\n" + printf("\n%s: There are %d new registers.\n" "Consider adding them to the blessed reg " - "list with the following lines:\n\n", new_regs); + "list with the following lines:\n\n", config_name(c), new_regs); for_each_new_reg(i) - print_reg(reg_list->reg[i]); + print_reg(c, reg_list->reg[i]); putchar('\n'); } if (missing_regs) { - printf("\nThere are %d missing registers.\n" - "The following lines are missing registers:\n\n", missing_regs); + printf("\n%s: There are %d missing registers.\n" + "The following lines are missing registers:\n\n", config_name(c), missing_regs); for_each_missing_reg(i) - print_reg(blessed_reg[i]); + print_reg(c, blessed_reg[i]); putchar('\n'); } TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, - "There are %d missing registers; " + "%s: There are %d missing registers; " "%d registers failed get; %d registers failed set; %d registers failed reject", - missing_regs, failed_get, failed_set, failed_reject); + config_name(c), missing_regs, failed_get, failed_set, failed_reject); - return 0; + pr_info("%s: PASS\n", config_name(c)); + blessed_n = 0; + free(blessed_reg); + free(reg_list); + kvm_vm_free(vm); +} + +static void help(void) +{ + struct vcpu_config *c; + int i; + + printf( + "\n" + "usage: get-reg-list [--config=<selection>] [--list] [--list-filtered] [--core-reg-fixup]\n\n" + " --config=<selection> Used to select a specific vcpu configuration for the test/listing\n" + " '<selection>' may be\n"); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + printf( + " '%s'\n", config_name(c)); + } + + printf( + "\n" + " --list Print the register list rather than test it (requires --config)\n" + " --list-filtered Print registers that would normally be filtered out (requires --config)\n" + " --core-reg-fixup Needed when running on old kernels with broken core reg listings\n" + "\n" + ); +} + +static struct vcpu_config *parse_config(const char *config) +{ + struct vcpu_config *c; + int i; + + if (config[8] != '=') + help(), exit(1); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (strcmp(config_name(c), &config[9]) == 0) + break; + } + + if (i == vcpu_configs_n) + help(), exit(1); + + return c; +} + +int main(int ac, char **av) +{ + struct vcpu_config *c, *sel = NULL; + int i, ret = 0; + pid_t pid; + + for (i = 1; i < ac; ++i) { + if (strcmp(av[i], "--core-reg-fixup") == 0) + fixup_core_regs = true; + else if (strncmp(av[i], "--config", 8) == 0) + sel = parse_config(av[i]); + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else if (strcmp(av[i], "--list-filtered") == 0) + print_filtered = true; + else if (strcmp(av[i], "--help") == 0 || strcmp(av[1], "-h") == 0) + help(), exit(0); + else + help(), exit(1); + } + + if (print_list || print_filtered) { + /* + * We only want to print the register list of a single config. + */ + if (!sel) + help(), exit(1); + } + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (sel && c != sel) + continue; + + pid = fork(); + + if (!pid) { + run_test(c); + exit(0); + } else { + int wstatus; + pid_t wpid = wait(&wstatus); + TEST_ASSERT(wpid == pid && WIFEXITED(wstatus), "wait: Unexpected return"); + if (WEXITSTATUS(wstatus) && WEXITSTATUS(wstatus) != KSFT_SKIP) + ret = KSFT_FAIL; + } + } + + return ret; } /* * The current blessed list was primed with the output of kernel version * v4.15 with --core-reg-fixup and then later updated with new registers. * - * The blessed list is up to date with kernel version v5.10-rc5 + * The blessed list is up to date with kernel version v5.13-rc3 */ static __u64 base_regs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), @@ -673,8 +829,6 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ - ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ - ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ @@ -683,6 +837,16 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ +}; + +static __u64 pmu_regs[] = { + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ @@ -692,8 +856,6 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 8, 0), ARM64_SYS_REG(3, 3, 14, 8, 1), ARM64_SYS_REG(3, 3, 14, 8, 2), @@ -757,11 +919,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 3, 14, 15, 5), ARM64_SYS_REG(3, 3, 14, 15, 6), ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ - ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ }; -static __u64 base_regs_n = ARRAY_SIZE(base_regs); static __u64 vregs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), @@ -797,7 +955,6 @@ static __u64 vregs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), }; -static __u64 vregs_n = ARRAY_SIZE(vregs); static __u64 sve_regs[] = { KVM_REG_ARM64_SVE_VLS, @@ -852,11 +1009,58 @@ static __u64 sve_regs[] = { KVM_REG_ARM64_SVE_FFR(0), ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ }; -static __u64 sve_regs_n = ARRAY_SIZE(sve_regs); -static __u64 rejects_set[] = { -#ifdef REG_LIST_SVE +static __u64 sve_rejects_set[] = { KVM_REG_ARM64_SVE_VLS, -#endif }; -static __u64 rejects_set_n = ARRAY_SIZE(rejects_set); + +#define BASE_SUBLIST \ + { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } +#define VREGS_SUBLIST \ + { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), } +#define PMU_SUBLIST \ + { "pmu", .capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \ + .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), } +#define SVE_SUBLIST \ + { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \ + .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \ + .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), } + +static struct vcpu_config vregs_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + {0}, + }, +}; +static struct vcpu_config vregs_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static struct vcpu_config sve_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + {0}, + }, +}; +static struct vcpu_config sve_pmu_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +static struct vcpu_config *vcpu_configs[] = { + &vregs_config, + &vregs_pmu_config, + &sve_config, + &sve_pmu_config, +}; +static int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c new file mode 100644 index 000000000000..e2baa187a21e --- /dev/null +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * access_tracking_perf_test + * + * Copyright (C) 2021, Google, Inc. + * + * This test measures the performance effects of KVM's access tracking. + * Access tracking is driven by the MMU notifiers test_young, clear_young, and + * clear_flush_young. These notifiers do not have a direct userspace API, + * however the clear_young notifier can be triggered by marking a pages as idle + * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to + * enable access tracking on guest memory. + * + * To measure performance this test runs a VM with a configurable number of + * vCPUs that each touch every page in disjoint regions of memory. Performance + * is measured in the time it takes all vCPUs to finish touching their + * predefined region. + * + * Note that a deterministic correctness test of access tracking is not possible + * by using page_idle as it exists today. This is for a few reasons: + * + * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This + * means subsequent guest accesses are not guaranteed to see page table + * updates made by KVM until some time in the future. + * + * 2. page_idle only operates on LRU pages. Newly allocated pages are not + * immediately allocated to LRU lists. Instead they are held in a "pagevec", + * which is drained to LRU lists some time in the future. There is no + * userspace API to force this drain to occur. + * + * These limitations are worked around in this test by using a large enough + * region of memory for each vCPU such that the number of translations cached in + * the TLB and the number of pages held in pagevecs are a small fraction of the + * overall workload. And if either of those conditions are not true this test + * will fail rather than silently passing. + */ +#include <inttypes.h> +#include <limits.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "perf_test_util.h" +#include "guest_modes.h" + +/* Global variable used to synchronize all of the vCPU threads. */ +static int iteration = -1; + +/* Defines what vCPU threads should do during a given iteration. */ +static enum { + /* Run the vCPU to access all its memory. */ + ITERATION_ACCESS_MEMORY, + /* Mark the vCPU's memory idle in page_idle. */ + ITERATION_MARK_IDLE, +} iteration_work; + +/* Set to true when vCPU threads should exit. */ +static bool done; + +/* The iteration that was last completed by each vCPU. */ +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +/* Whether to overlap the regions of memory vCPUs access. */ +static bool overlap_memory_access; + +struct test_params { + /* The backing source for the region of memory. */ + enum vm_mem_backing_src_type backing_src; + + /* The amount of memory to allocate for each vCPU. */ + uint64_t vcpu_memory_bytes; + + /* The number of vCPUs to create in the VM. */ + int vcpus; +}; + +static uint64_t pread_uint64(int fd, const char *filename, uint64_t index) +{ + uint64_t value; + off_t offset = index * sizeof(value); + + TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value), + "pread from %s offset 0x%" PRIx64 " failed!", + filename, offset); + + return value; + +} + +#define PAGEMAP_PRESENT (1ULL << 63) +#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1) + +static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva) +{ + uint64_t hva = (uint64_t) addr_gva2hva(vm, gva); + uint64_t entry; + uint64_t pfn; + + entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize()); + if (!(entry & PAGEMAP_PRESENT)) + return 0; + + pfn = entry & PAGEMAP_PFN_MASK; + if (!pfn) { + print_skip("Looking up PFNs requires CAP_SYS_ADMIN"); + exit(KSFT_SKIP); + } + + return pfn; +} + +static bool is_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64); + + return !!((bits >> (pfn % 64)) & 1); +} + +static void mark_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = 1ULL << (pfn % 64); + + TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8, + "Set page_idle bits for PFN 0x%" PRIx64, pfn); +} + +static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) +{ + uint64_t base_gva = perf_test_args.vcpu_args[vcpu_id].gva; + uint64_t pages = perf_test_args.vcpu_args[vcpu_id].pages; + uint64_t page; + uint64_t still_idle = 0; + uint64_t no_pfn = 0; + int page_idle_fd; + int pagemap_fd; + + /* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */ + if (overlap_memory_access && vcpu_id) + return; + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle."); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap."); + + for (page = 0; page < pages; page++) { + uint64_t gva = base_gva + page * perf_test_args.guest_page_size; + uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva); + + if (!pfn) { + no_pfn++; + continue; + } + + if (is_page_idle(page_idle_fd, pfn)) { + still_idle++; + continue; + } + + mark_page_idle(page_idle_fd, pfn); + } + + /* + * Assumption: Less than 1% of pages are going to be swapped out from + * under us during this test. + */ + TEST_ASSERT(no_pfn < pages / 100, + "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.", + vcpu_id, no_pfn, pages); + + /* + * Test that at least 90% of memory has been marked idle (the rest might + * not be marked idle because the pages have not yet made it to an LRU + * list or the translations are still cached in the TLB). 90% is + * arbitrary; high enough that we ensure most memory access went through + * access tracking but low enough as to not make the test too brittle + * over time and across architectures. + */ + TEST_ASSERT(still_idle < pages / 10, + "vCPU%d: Too many pages still idle (%"PRIu64 " out of %" + PRIu64 ").\n", + vcpu_id, still_idle, pages); + + close(page_idle_fd); + close(pagemap_fd); +} + +static void assert_ucall(struct kvm_vm *vm, uint32_t vcpu_id, + uint64_t expected_ucall) +{ + struct ucall uc; + uint64_t actual_ucall = get_ucall(vm, vcpu_id, &uc); + + TEST_ASSERT(expected_ucall == actual_ucall, + "Guest exited unexpectedly (expected ucall %" PRIu64 + ", got %" PRIu64 ")", + expected_ucall, actual_ucall); +} + +static bool spin_wait_for_next_iteration(int *current_iteration) +{ + int last_iteration = *current_iteration; + + do { + if (READ_ONCE(done)) + return false; + + *current_iteration = READ_ONCE(iteration); + } while (last_iteration == *current_iteration); + + return true; +} + +static void *vcpu_thread_main(void *arg) +{ + struct perf_test_vcpu_args *vcpu_args = arg; + struct kvm_vm *vm = perf_test_args.vm; + int vcpu_id = vcpu_args->vcpu_id; + int current_iteration = -1; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + + while (spin_wait_for_next_iteration(¤t_iteration)) { + switch (READ_ONCE(iteration_work)) { + case ITERATION_ACCESS_MEMORY: + vcpu_run(vm, vcpu_id); + assert_ucall(vm, vcpu_id, UCALL_SYNC); + break; + case ITERATION_MARK_IDLE: + mark_vcpu_memory_idle(vm, vcpu_id); + break; + }; + + vcpu_last_completed_iteration[vcpu_id] = current_iteration; + } + + return NULL; +} + +static void spin_wait_for_vcpu(int vcpu_id, int target_iteration) +{ + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != + target_iteration) { + continue; + } +} + +/* The type of memory accesses to perform in the VM. */ +enum access_type { + ACCESS_READ, + ACCESS_WRITE, +}; + +static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description) +{ + struct timespec ts_start; + struct timespec ts_elapsed; + int next_iteration; + int vcpu_id; + + /* Kick off the vCPUs by incrementing iteration. */ + next_iteration = ++iteration; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + + /* Wait for all vCPUs to finish the iteration. */ + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) + spin_wait_for_vcpu(vcpu_id, next_iteration); + + ts_elapsed = timespec_elapsed(ts_start); + pr_info("%-30s: %ld.%09lds\n", + description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec); +} + +static void access_memory(struct kvm_vm *vm, int vcpus, enum access_type access, + const char *description) +{ + perf_test_args.wr_fract = (access == ACCESS_READ) ? INT_MAX : 1; + sync_global_to_guest(vm, perf_test_args); + iteration_work = ITERATION_ACCESS_MEMORY; + run_iteration(vm, vcpus, description); +} + +static void mark_memory_idle(struct kvm_vm *vm, int vcpus) +{ + /* + * Even though this parallelizes the work across vCPUs, this is still a + * very slow operation because page_idle forces the test to mark one pfn + * at a time and the clear_young notifier serializes on the KVM MMU + * lock. + */ + pr_debug("Marking VM memory idle (slow)...\n"); + iteration_work = ITERATION_MARK_IDLE; + run_iteration(vm, vcpus, "Mark memory idle"); +} + +static pthread_t *create_vcpu_threads(int vcpus) +{ + pthread_t *vcpu_threads; + int i; + + vcpu_threads = malloc(vcpus * sizeof(vcpu_threads[0])); + TEST_ASSERT(vcpu_threads, "Failed to allocate vcpu_threads."); + + for (i = 0; i < vcpus; i++) { + vcpu_last_completed_iteration[i] = iteration; + pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main, + &perf_test_args.vcpu_args[i]); + } + + return vcpu_threads; +} + +static void terminate_vcpu_threads(pthread_t *vcpu_threads, int vcpus) +{ + int i; + + /* Set done to signal the vCPU threads to exit */ + done = true; + + for (i = 0; i < vcpus; i++) + pthread_join(vcpu_threads[i], NULL); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *params = arg; + struct kvm_vm *vm; + pthread_t *vcpu_threads; + int vcpus = params->vcpus; + + vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, + params->backing_src); + + perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, + !overlap_memory_access); + + vcpu_threads = create_vcpu_threads(vcpus); + + pr_info("\n"); + access_memory(vm, vcpus, ACCESS_WRITE, "Populating memory"); + + /* As a control, read and write to the populated memory first. */ + access_memory(vm, vcpus, ACCESS_WRITE, "Writing to populated memory"); + access_memory(vm, vcpus, ACCESS_READ, "Reading from populated memory"); + + /* Repeat on memory that has been marked as idle. */ + mark_memory_idle(vm, vcpus); + access_memory(vm, vcpus, ACCESS_WRITE, "Writing to idle memory"); + mark_memory_idle(vm, vcpus); + access_memory(vm, vcpus, ACCESS_READ, "Reading from idle memory"); + + terminate_vcpu_threads(vcpu_threads, vcpus); + free(vcpu_threads); + perf_test_destroy_vm(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n", + name); + puts(""); + printf(" -h: Display this help message."); + guest_modes_help(); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + printf(" -s: specify the type of memory that should be used to\n" + " back the guest data region.\n\n"); + backing_src_help(); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct test_params params = { + .backing_src = VM_MEM_SRC_ANONYMOUS, + .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, + .vcpus = 1, + }; + int page_idle_fd; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 'b': + params.vcpu_memory_bytes = parse_size(optarg); + break; + case 'v': + params.vcpus = atoi(optarg); + break; + case 'o': + overlap_memory_access = true; + break; + case 's': + params.backing_src = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + if (page_idle_fd < 0) { + print_skip("CONFIG_IDLE_PAGE_TRACKING is not enabled"); + exit(KSFT_SKIP); + } + close(page_idle_fd); + + for_each_guest_mode(run_test, ¶ms); + + return 0; +} diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 5f7a229c3af1..b74704305835 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE /* for pipe2 */ +#include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <time.h> @@ -38,6 +39,7 @@ static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +static size_t demand_paging_size; static char *guest_data_prototype; static void *vcpu_worker(void *data) @@ -71,36 +73,51 @@ static void *vcpu_worker(void *data) return NULL; } -static int handle_uffd_page_request(int uffd, uint64_t addr) +static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) { - pid_t tid; + pid_t tid = syscall(__NR_gettid); struct timespec start; struct timespec ts_diff; - struct uffdio_copy copy; int r; - tid = syscall(__NR_gettid); + clock_gettime(CLOCK_MONOTONIC, &start); - copy.src = (uint64_t)guest_data_prototype; - copy.dst = addr; - copy.len = perf_test_args.host_page_size; - copy.mode = 0; + if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) { + struct uffdio_copy copy; - clock_gettime(CLOCK_MONOTONIC, &start); + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = demand_paging_size; + copy.mode = 0; - r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", - addr, tid, errno); - return r; + r = ioctl(uffd, UFFDIO_COPY, ©); + if (r == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + struct uffdio_continue cont = {0}; + + cont.range.start = addr; + cont.range.len = demand_paging_size; + + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); + if (r == -1) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else { + TEST_FAIL("Invalid uffd mode %d", uffd_mode); } ts_diff = timespec_elapsed(start); - PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - perf_test_args.host_page_size, addr, tid); + demand_paging_size, addr, tid); return 0; } @@ -108,6 +125,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) bool quit_uffd_thread; struct uffd_handler_args { + int uffd_mode; int uffd; int pipefd; useconds_t delay; @@ -169,7 +187,7 @@ static void *uffd_handler_thread_fn(void *arg) if (r == -1) { if (errno == EAGAIN) continue; - pr_info("Read of uffd gor errno %d", errno); + pr_info("Read of uffd got errno %d\n", errno); return NULL; } @@ -184,7 +202,7 @@ static void *uffd_handler_thread_fn(void *arg) if (delay) usleep(delay); addr = msg.arg.pagefault.address; - r = handle_uffd_page_request(uffd, addr); + r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr); if (r < 0) return NULL; pages++; @@ -198,43 +216,53 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } -static int setup_demand_paging(struct kvm_vm *vm, - pthread_t *uffd_handler_thread, int pipefd, - useconds_t uffd_delay, - struct uffd_handler_args *uffd_args, - void *hva, uint64_t len) +static void setup_demand_paging(struct kvm_vm *vm, + pthread_t *uffd_handler_thread, int pipefd, + int uffd_mode, useconds_t uffd_delay, + struct uffd_handler_args *uffd_args, + void *hva, void *alias, uint64_t len) { + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); int uffd; struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - pr_info("uffd creation failed\n"); - return -1; + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) { + size_t p; + + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); + for (p = 0; p < (len / demand_paging_size); ++p) { + memcpy(alias + (p * demand_paging_size), + guest_data_prototype, demand_paging_size); + } } + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + uffdio_api.api = UFFD_API; uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - pr_info("ioctl uffdio_api failed\n"); - return -1; - } + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); uffdio_register.range.start = (uint64_t)hva; uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - pr_info("ioctl uffdio_register failed\n"); - return -1; - } - - if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != - UFFD_API_RANGE_IOCTLS) { - pr_info("unexpected userfaultfd ioctl set\n"); - return -1; - } + uffdio_register.mode = uffd_mode; + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + uffd_args->uffd_mode = uffd_mode; uffd_args->uffd = uffd; uffd_args->pipefd = pipefd; uffd_args->delay = uffd_delay; @@ -243,13 +271,12 @@ static int setup_demand_paging(struct kvm_vm *vm, PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", hva, hva + len); - - return 0; } struct test_params { - bool use_uffd; + int uffd_mode; useconds_t uffd_delay; + enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -267,14 +294,16 @@ static void run_test(enum vm_guest_mode mode, void *arg) int r; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - VM_MEM_SRC_ANONYMOUS); + p->src_type); perf_test_args.wr_fract = 1; - guest_data_prototype = malloc(perf_test_args.host_page_size); + demand_paging_size = get_backing_src_pagesz(p->src_type); + + guest_data_prototype = malloc(demand_paging_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); + memset(guest_data_prototype, 0xAB, demand_paging_size); vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); @@ -282,7 +311,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, p->partition_vcpu_memory_access); - if (p->use_uffd) { + if (p->uffd_mode) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); @@ -296,6 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vm_paddr_t vcpu_gpa; void *vcpu_hva; + void *vcpu_alias; uint64_t vcpu_mem_size; @@ -310,8 +340,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); - /* Cache the HVA pointer of the region */ + /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + vcpu_alias = addr_gpa2alias(vm, vcpu_gpa); /* * Set up user fault fd to handle demand paging @@ -321,13 +352,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) O_CLOEXEC | O_NONBLOCK); TEST_ASSERT(!r, "Failed to set up pipefd"); - r = setup_demand_paging(vm, - &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], - p->uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_mem_size); - if (r < 0) - exit(-r); + setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], + pipefds[vcpu_id * 2], p->uffd_mode, + p->uffd_delay, &uffd_args[vcpu_id], + vcpu_hva, vcpu_alias, + vcpu_mem_size); } } @@ -355,7 +384,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("All vCPU threads joined\n"); - if (p->use_uffd) { + if (p->uffd_mode) { char c; /* Tell the user fault fd handler threads to quit */ @@ -377,7 +406,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) free(guest_data_prototype); free(vcpu_threads); - if (p->use_uffd) { + if (p->uffd_mode) { free(uffd_handler_threads); free(uffd_args); free(pipefds); @@ -387,17 +416,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" - " [-b memory] [-v vcpus] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" + " [-b memory] [-t type] [-v vcpus] [-o]\n", name); guest_modes_help(); - printf(" -u: use User Fault FD to handle vCPU page\n" - " faults.\n"); + printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" + " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); + printf(" -t: The type of backing memory to use. Default: anonymous\n"); + backing_src_help(); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -409,19 +440,24 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { + .src_type = VM_MEM_SRC_ANONYMOUS, .partition_vcpu_memory_access = true, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); break; case 'u': - p.use_uffd = true; + if (!strcmp("MISSING", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + else if (!strcmp("MINOR", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; + TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); @@ -430,6 +466,9 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; + case 't': + p.src_type = parse_backing_src_type(optarg); + break; case 'v': nr_vcpus = atoi(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, @@ -445,6 +484,11 @@ int main(int argc, char *argv[]) } } + if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && + !backing_src_is_shared(p.src_type)) { + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + } + for_each_guest_mode(run_test, &p); return 0; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 04a2641261be..80cbd3a748c0 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -312,6 +312,7 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + break; case 's': p.backing_src = parse_backing_src_type(optarg); break; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 81edbd23d371..5fe0140e407e 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -16,7 +16,6 @@ #include <errno.h> #include <linux/bitmap.h> #include <linux/bitops.h> -#include <asm/barrier.h> #include <linux/atomic.h> #include "kvm_util.h" @@ -681,7 +680,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); #endif @@ -761,7 +760,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) KVM_MEM_LOG_DIRTY_PAGES); /* Do mapping for the dirty track memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 5aadf84c91c0..b21c69a56daa 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -105,7 +105,7 @@ static void run_test(uint32_t run) CPU_SET(i, &cpu_set); vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); pr_debug("%s: [%d] start vcpus\n", __func__, run); @@ -132,6 +132,36 @@ static void run_test(uint32_t run) TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); } +void wait_for_child_setup(pid_t pid) +{ + /* + * Wait for the child to post to the semaphore, but wake up periodically + * to check if the child exited prematurely. + */ + for (;;) { + const struct timespec wait_period = { .tv_sec = 1 }; + int status; + + if (!sem_timedwait(sem, &wait_period)) + return; + + /* Child is still running, keep waiting. */ + if (pid != waitpid(pid, &status, WNOHANG)) + continue; + + /* + * Child is no longer running, which is not expected. + * + * If it exited with a non-zero status, we explicitly forward + * the child's status in case it exited with KSFT_SKIP. + */ + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else + TEST_ASSERT(false, "Child exited unexpectedly"); + } +} + int main(int argc, char **argv) { uint32_t i; @@ -148,7 +178,7 @@ int main(int argc, char **argv) run_test(i); /* This function always exits */ pr_debug("%s: [%d] waiting semaphore\n", __func__, i); - sem_wait(sem); + wait_for_child_setup(pid); r = (rand() % DELAY_US_MAX) + 1; pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); usleep(r); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index b7fa0c8551db..27dc5c2e56b9 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,16 +8,20 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include <linux/stringify.h> #define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) -#define CPACR_EL1 3, 0, 1, 0, 2 -#define TCR_EL1 3, 0, 2, 0, 2 -#define MAIR_EL1 3, 0, 10, 2, 0 -#define TTBR0_EL1 3, 0, 2, 0, 0 -#define SCTLR_EL1 3, 0, 1, 0, 0 +#define CPACR_EL1 3, 0, 1, 0, 2 +#define TCR_EL1 3, 0, 2, 0, 2 +#define MAIR_EL1 3, 0, 10, 2, 0 +#define TTBR0_EL1 3, 0, 2, 0, 0 +#define SCTLR_EL1 3, 0, 1, 0, 0 +#define VBAR_EL1 3, 0, 12, 0, 0 + +#define ID_AA64DFR0_EL1 3, 0, 0, 5, 0 /* * Default MAIR @@ -56,4 +60,73 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init, void *guest_code); +struct ex_regs { + u64 regs[31]; + u64 sp; + u64 pc; + u64 pstate; +}; + +#define VECTOR_NUM 16 + +enum { + VECTOR_SYNC_CURRENT_SP0, + VECTOR_IRQ_CURRENT_SP0, + VECTOR_FIQ_CURRENT_SP0, + VECTOR_ERROR_CURRENT_SP0, + + VECTOR_SYNC_CURRENT, + VECTOR_IRQ_CURRENT, + VECTOR_FIQ_CURRENT, + VECTOR_ERROR_CURRENT, + + VECTOR_SYNC_LOWER_64, + VECTOR_IRQ_LOWER_64, + VECTOR_FIQ_LOWER_64, + VECTOR_ERROR_LOWER_64, + + VECTOR_SYNC_LOWER_32, + VECTOR_IRQ_LOWER_32, + VECTOR_FIQ_LOWER_32, + VECTOR_ERROR_LOWER_32, +}; + +#define VECTOR_IS_SYNC(v) ((v) == VECTOR_SYNC_CURRENT_SP0 || \ + (v) == VECTOR_SYNC_CURRENT || \ + (v) == VECTOR_SYNC_LOWER_64 || \ + (v) == VECTOR_SYNC_LOWER_32) + +#define ESR_EC_NUM 64 +#define ESR_EC_SHIFT 26 +#define ESR_EC_MASK (ESR_EC_NUM - 1) + +#define ESR_EC_SVC64 0x15 +#define ESR_EC_HW_BP_CURRENT 0x31 +#define ESR_EC_SSTEP_CURRENT 0x33 +#define ESR_EC_WP_CURRENT 0x35 +#define ESR_EC_BRK_INS 0x3c + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); + +typedef void(*handler_fn)(struct ex_regs *); +void vm_install_exception_handler(struct kvm_vm *vm, + int vector, handler_fn handler); +void vm_install_sync_handler(struct kvm_vm *vm, + int vector, int ec, handler_fn handler); + +#define write_sysreg(reg, val) \ +({ \ + u64 __val = (u64)(val); \ + asm volatile("msr " __stringify(reg) ", %x0" : : "rZ" (__val)); \ +}) + +#define read_sysreg(reg) \ +({ u64 val; \ + asm volatile("mrs %0, "__stringify(reg) : "=r"(val) : : "memory");\ + val; \ +}) + +#define isb() asm volatile("isb" : : : "memory") + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a8f022794ce3..010b59b13917 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -30,6 +30,7 @@ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_GUEST_PHY_PAGES 512 #define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 @@ -43,6 +44,8 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, NUM_VM_MODES, }; @@ -60,7 +63,7 @@ enum vm_guest_mode { #elif defined(__s390x__) -#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 16) @@ -77,6 +80,7 @@ struct vm_guest_mode_params { }; extern const struct vm_guest_mode_params vm_guest_mode_params[]; +int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, @@ -96,8 +100,7 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm); int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); @@ -139,13 +142,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot); + unsigned int npages); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); /* * Address Guest Virtual to Guest Physical @@ -234,7 +240,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, const char *exit_reason_str(unsigned int exit_reason); -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); +void virt_pgd_alloc(struct kvm_vm *vm); /* * VM Virtual Page Map @@ -252,13 +258,13 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); * Within @vm, creates a virtual translation for the page starting * at @vaddr to the page starting at @paddr. */ -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t memslot); +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, vm_paddr_t paddr_min, uint32_t memslot); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); /* * Create a VM with reasonable defaults @@ -283,10 +289,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]); -/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */ +/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]); + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]); /* * Adds a vCPU with reasonable defaults (e.g. a stack) @@ -302,7 +309,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); -unsigned int vm_get_max_gfn(struct kvm_vm *vm); +uint64_t vm_get_max_gfn(struct kvm_vm *vm); int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); @@ -347,6 +354,7 @@ enum { UCALL_SYNC, UCALL_ABORT, UCALL_DONE, + UCALL_UNHANDLED, }; #define UCALL_MAX_ARGS 6 @@ -365,26 +373,31 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) -#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2 + _nargs, \ - "Failed guest assert: " \ - #_condition, __LINE__, _args); \ +#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + _condstr, __LINE__, _args); \ } while (0) #define GUEST_ASSERT(_condition) \ - __GUEST_ASSERT((_condition), 0, 0) + __GUEST_ASSERT(_condition, #_condition, 0, 0) #define GUEST_ASSERT_1(_condition, arg1) \ - __GUEST_ASSERT((_condition), 1, (arg1)) + __GUEST_ASSERT(_condition, #_condition, 1, (arg1)) #define GUEST_ASSERT_2(_condition, arg1, arg2) \ - __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + __GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2)) #define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ - __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + __GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3)) #define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ - __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + __GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4)) + +#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b) + +int vm_get_stats_fd(struct kvm_vm *vm); +int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index fade3130eb01..d79be15dd3d2 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -17,6 +17,7 @@ #include <errno.h> #include <unistd.h> #include <fcntl.h> +#include <sys/mman.h> #include "kselftest.h" static inline int _no_printf(const char *format, ...) { return 0; } @@ -84,6 +85,8 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, + VM_MEM_SRC_SHMEM, + VM_MEM_SRC_SHARED_HUGETLB, NUM_SRC_TYPES, }; @@ -100,4 +103,13 @@ size_t get_backing_src_pagesz(uint32_t i); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +/* + * Whether or not the given source type is shared memory (as opposed to + * anonymous). + */ +static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) +{ + return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; +} + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h new file mode 100644 index 000000000000..0be4757f1f20 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/apic.h + * + * Copyright (C) 2021, Google LLC. + */ + +#ifndef SELFTEST_KVM_APIC_H +#define SELFTEST_KVM_APIC_H + +#include <stdint.h> + +#include "processor.h" + +#define APIC_DEFAULT_GPA 0xfee00000ULL + +/* APIC base address MSR and fields */ +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_EXTD (1<<10) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define GET_APIC_BASE(x) (((x) >> 12) << 12) + +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) +#define APIC_ID 0x20 +#define APIC_LVR 0x30 +#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) +#define APIC_TASKPRI 0x80 +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define SET_APIC_DEST_FIELD(x) ((x) << 24) + +void apic_disable(void); +void xapic_enable(void); +void x2apic_enable(void); + +static inline uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static inline uint32_t xapic_read_reg(unsigned int reg) +{ + return ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2]; +} + +static inline void xapic_write_reg(unsigned int reg, uint32_t val) +{ + ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2] = val; +} + +static inline uint64_t x2apic_read_reg(unsigned int reg) +{ + return rdmsr(APIC_BASE_MSR + (reg >> 4)); +} + +static inline void x2apic_write_reg(unsigned int reg, uint64_t value) +{ + wrmsr(APIC_BASE_MSR + (reg >> 4), value); +} + +#endif /* SELFTEST_KVM_APIC_H */ diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index a034438b6266..c9af97abd622 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * tools/testing/selftests/kvm/include/vmx.h + * tools/testing/selftests/kvm/include/x86_64/evmcs.h * * Copyright (C) 2018, Red Hat, Inc. * diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h new file mode 100644 index 000000000000..b66910702c0a --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/hyperv.h + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ + +#ifndef SELFTEST_KVM_HYPERV_H +#define SELFTEST_KVM_HYPERV_H + +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_HYPERCALL 0x40000001 +#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_X64_MSR_RESET 0x40000003 +#define HV_X64_MSR_VP_RUNTIME 0x40000010 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +#define HV_X64_MSR_GUEST_IDLE 0x400000F0 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* HYPERV_CPUID_FEATURES.EAX */ +#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) +#define HV_MSR_SYNIC_AVAILABLE BIT(2) +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) +#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) +#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) +#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) +#define HV_MSR_RESET_AVAILABLE BIT(7) +#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) +#define HV_ACCESS_FREQUENCY_MSRS BIT(11) +#define HV_ACCESS_REENLIGHTENMENT BIT(13) +#define HV_ACCESS_TSC_INVARIANT BIT(15) + +/* HYPERV_CPUID_FEATURES.EBX */ +#define HV_CREATE_PARTITIONS BIT(0) +#define HV_ACCESS_PARTITION_ID BIT(1) +#define HV_ACCESS_MEMORY_POOL BIT(2) +#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_POST_MESSAGES BIT(4) +#define HV_SIGNAL_EVENTS BIT(5) +#define HV_CREATE_PORT BIT(6) +#define HV_CONNECT_PORT BIT(7) +#define HV_ACCESS_STATS BIT(8) +#define HV_DEBUGGING BIT(11) +#define HV_CPU_MANAGEMENT BIT(12) +#define HV_ISOLATION BIT(22) + +/* HYPERV_CPUID_FEATURES.EDX */ +#define HV_X64_MWAIT_AVAILABLE BIT(0) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) +#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) + +/* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ +#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) +#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) +#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) +#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) +#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) +#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) + +/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1) + +/* Hypercalls */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_CREATE_VP 0x004e +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_ACCESS_DENIED 6 +#define HV_STATUS_OPERATION_DENIED 8 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* hypercall options */ +#define HV_HYPERCALL_FAST_BIT BIT(16) + +#endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0b30b4e15c38..242ae8e09a65 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -13,6 +13,8 @@ #include <asm/msr-index.h> +#include "../kvm_util.h" + #define X86_EFLAGS_FIXED (1u << 1) #define X86_CR4_VME (1ul << 0) @@ -53,7 +55,8 @@ #define CPUID_PKU (1ul << 3) #define CPUID_LA57 (1ul << 16) -#define UNEXPECTED_VECTOR_PORT 0xfff0u +/* CPUID.0x8000_0001.EDX */ +#define CPUID_GBPAGES (1ul << 26) /* General Registers in 64-Bit Mode */ struct gpr64_regs { @@ -391,9 +394,13 @@ struct ex_regs { void vm_init_descriptor_tables(struct kvm_vm *vm); void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); -void vm_handle_exception(struct kvm_vm *vm, int vector, +void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr); +void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, + uint64_t pte); + /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. returns true @@ -410,6 +417,14 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +enum x86_page_size { + X86_PAGE_SIZE_4K = 0, + X86_PAGE_SIZE_2M, + X86_PAGE_SIZE_1G, +}; +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + enum x86_page_size page_size); + /* * Basic CPU control in CR0 */ @@ -425,53 +440,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui #define X86_CR0_CD (1UL<<30) /* Cache Disable */ #define X86_CR0_PG (1UL<<31) /* Paging */ -#define APIC_DEFAULT_GPA 0xfee00000ULL - -/* APIC base address MSR and fields */ -#define MSR_IA32_APICBASE 0x0000001b -#define MSR_IA32_APICBASE_BSP (1<<8) -#define MSR_IA32_APICBASE_EXTD (1<<10) -#define MSR_IA32_APICBASE_ENABLE (1<<11) -#define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define GET_APIC_BASE(x) (((x) >> 12) << 12) - -#define APIC_BASE_MSR 0x800 -#define X2APIC_ENABLE (1UL << 10) -#define APIC_ID 0x20 -#define APIC_LVR 0x30 -#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) -#define APIC_TASKPRI 0x80 -#define APIC_PROCPRI 0xA0 -#define APIC_EOI 0xB0 -#define APIC_SPIV 0xF0 -#define APIC_SPIV_FOCUS_DISABLED (1 << 9) -#define APIC_SPIV_APIC_ENABLED (1 << 8) -#define APIC_ICR 0x300 -#define APIC_DEST_SELF 0x40000 -#define APIC_DEST_ALLINC 0x80000 -#define APIC_DEST_ALLBUT 0xC0000 -#define APIC_ICR_RR_MASK 0x30000 -#define APIC_ICR_RR_INVALID 0x00000 -#define APIC_ICR_RR_INPROG 0x10000 -#define APIC_ICR_RR_VALID 0x20000 -#define APIC_INT_LEVELTRIG 0x08000 -#define APIC_INT_ASSERT 0x04000 -#define APIC_ICR_BUSY 0x01000 -#define APIC_DEST_LOGICAL 0x00800 -#define APIC_DEST_PHYSICAL 0x00000 -#define APIC_DM_FIXED 0x00000 -#define APIC_DM_FIXED_MASK 0x00700 -#define APIC_DM_LOWEST 0x00100 -#define APIC_DM_SMI 0x00200 -#define APIC_DM_REMRD 0x00300 -#define APIC_DM_NMI 0x00400 -#define APIC_DM_INIT 0x00500 -#define APIC_DM_STARTUP 0x00600 -#define APIC_DM_EXTINT 0x00700 -#define APIC_VECTOR_MASK 0x000FF -#define APIC_ICR2 0x310 -#define SET_APIC_DEST_FIELD(x) ((x) << 24) - /* VMX_EPT_VPID_CAP bits */ #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 65eb1079a161..583ceb0d1457 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -10,6 +10,7 @@ #include <stdint.h> #include "processor.h" +#include "apic.h" /* * Definitions of Primary Processor-Based VM-Execution Controls. @@ -607,15 +608,13 @@ bool nested_vmx_supported(void); void nested_vmx_check_supported(void); void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr, uint64_t size); void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot); + uint32_t memslot); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c new file mode 100644 index 000000000000..5906bbc08483 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_binary_stats_test + * + * Copyright (C) 2021, Google LLC. + * + * Test the fd-based interface for KVM statistics. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" + +static void stats_test(int stats_fd) +{ + ssize_t ret; + int i; + size_t size_desc; + size_t size_data = 0; + struct kvm_stats_header *header; + char *id; + struct kvm_stats_desc *stats_desc; + u64 *stats_data; + struct kvm_stats_desc *pdesc; + + /* Read kvm stats header */ + header = malloc(sizeof(*header)); + TEST_ASSERT(header, "Allocate memory for stats header"); + + ret = read(stats_fd, header, sizeof(*header)); + TEST_ASSERT(ret == sizeof(*header), "Read stats header"); + size_desc = sizeof(*stats_desc) + header->name_size; + + /* Read kvm stats id string */ + id = malloc(header->name_size); + TEST_ASSERT(id, "Allocate memory for id string"); + ret = read(stats_fd, id, header->name_size); + TEST_ASSERT(ret == header->name_size, "Read id string"); + + /* Check id string, that should start with "kvm" */ + TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header->name_size, + "Invalid KVM stats type, id: %s", id); + + /* Sanity check for other fields in header */ + if (header->num_desc == 0) { + printf("No KVM stats defined!"); + return; + } + /* Check overlap */ + TEST_ASSERT(header->desc_offset > 0 && header->data_offset > 0 + && header->desc_offset >= sizeof(*header) + && header->data_offset >= sizeof(*header), + "Invalid offset fields in header"); + TEST_ASSERT(header->desc_offset > header->data_offset || + (header->desc_offset + size_desc * header->num_desc <= + header->data_offset), + "Descriptor block is overlapped with data block"); + + /* Allocate memory for stats descriptors */ + stats_desc = calloc(header->num_desc, size_desc); + TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); + /* Read kvm stats descriptors */ + ret = pread(stats_fd, stats_desc, + size_desc * header->num_desc, header->desc_offset); + TEST_ASSERT(ret == size_desc * header->num_desc, + "Read KVM stats descriptors"); + + /* Sanity check for fields in descriptors */ + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + /* Check type,unit,base boundaries */ + TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) + <= KVM_STATS_TYPE_MAX, "Unknown KVM stats type"); + TEST_ASSERT((pdesc->flags & KVM_STATS_UNIT_MASK) + <= KVM_STATS_UNIT_MAX, "Unknown KVM stats unit"); + TEST_ASSERT((pdesc->flags & KVM_STATS_BASE_MASK) + <= KVM_STATS_BASE_MAX, "Unknown KVM stats base"); + /* Check exponent for stats unit + * Exponent for counter should be greater than or equal to 0 + * Exponent for unit bytes should be greater than or equal to 0 + * Exponent for unit seconds should be less than or equal to 0 + * Exponent for unit clock cycles should be greater than or + * equal to 0 + */ + switch (pdesc->flags & KVM_STATS_UNIT_MASK) { + case KVM_STATS_UNIT_NONE: + case KVM_STATS_UNIT_BYTES: + case KVM_STATS_UNIT_CYCLES: + TEST_ASSERT(pdesc->exponent >= 0, + "Unsupported KVM stats unit"); + break; + case KVM_STATS_UNIT_SECONDS: + TEST_ASSERT(pdesc->exponent <= 0, + "Unsupported KVM stats unit"); + break; + } + /* Check name string */ + TEST_ASSERT(strlen(pdesc->name) < header->name_size, + "KVM stats name(%s) too long", pdesc->name); + /* Check size field, which should not be zero */ + TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", + pdesc->name); + size_data += pdesc->size * sizeof(*stats_data); + } + /* Check overlap */ + TEST_ASSERT(header->data_offset >= header->desc_offset + || header->data_offset + size_data <= header->desc_offset, + "Data block is overlapped with Descriptor block"); + /* Check validity of all stats data size */ + TEST_ASSERT(size_data >= header->num_desc * sizeof(*stats_data), + "Data size is not correct"); + /* Check stats offset */ + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + TEST_ASSERT(pdesc->offset < size_data, + "Invalid offset (%u) for stats: %s", + pdesc->offset, pdesc->name); + } + + /* Allocate memory for stats data */ + stats_data = malloc(size_data); + TEST_ASSERT(stats_data, "Allocate memory for stats data"); + /* Read kvm stats data as a bulk */ + ret = pread(stats_fd, stats_data, size_data, header->data_offset); + TEST_ASSERT(ret == size_data, "Read KVM stats data"); + /* Read kvm stats data one by one */ + size_data = 0; + for (i = 0; i < header->num_desc; ++i) { + pdesc = (void *)stats_desc + i * size_desc; + ret = pread(stats_fd, stats_data, + pdesc->size * sizeof(*stats_data), + header->data_offset + size_data); + TEST_ASSERT(ret == pdesc->size * sizeof(*stats_data), + "Read data of KVM stats: %s", pdesc->name); + size_data += pdesc->size * sizeof(*stats_data); + } + + free(stats_data); + free(stats_desc); + free(id); + free(header); +} + + +static void vm_stats_test(struct kvm_vm *vm) +{ + int stats_fd; + + /* Get fd for VM stats */ + stats_fd = vm_get_stats_fd(vm); + TEST_ASSERT(stats_fd >= 0, "Get VM stats fd"); + + stats_test(stats_fd); + close(stats_fd); + TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); +} + +static void vcpu_stats_test(struct kvm_vm *vm, int vcpu_id) +{ + int stats_fd; + + /* Get fd for VCPU stats */ + stats_fd = vcpu_get_stats_fd(vm, vcpu_id); + TEST_ASSERT(stats_fd >= 0, "Get VCPU stats fd"); + + stats_test(stats_fd); + close(stats_fd); + TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); +} + +#define DEFAULT_NUM_VM 4 +#define DEFAULT_NUM_VCPU 4 + +/* + * Usage: kvm_bin_form_stats [#vm] [#vcpu] + * The first parameter #vm set the number of VMs being created. + * The second parameter #vcpu set the number of VCPUs being created. + * By default, DEFAULT_NUM_VM VM and DEFAULT_NUM_VCPU VCPU for the VM would be + * created for testing. + */ + +int main(int argc, char *argv[]) +{ + int i, j; + struct kvm_vm **vms; + int max_vm = DEFAULT_NUM_VM; + int max_vcpu = DEFAULT_NUM_VCPU; + + /* Get the number of VMs and VCPUs that would be created for testing. */ + if (argc > 1) { + max_vm = strtol(argv[1], NULL, 0); + if (max_vm <= 0) + max_vm = DEFAULT_NUM_VM; + } + if (argc > 2) { + max_vcpu = strtol(argv[2], NULL, 0); + if (max_vcpu <= 0) + max_vcpu = DEFAULT_NUM_VCPU; + } + + /* Check the extension for binary stats */ + if (kvm_check_cap(KVM_CAP_BINARY_STATS_FD) <= 0) { + print_skip("Binary form statistics interface is not supported"); + exit(KSFT_SKIP); + } + + /* Create VMs and VCPUs */ + vms = malloc(sizeof(vms[0]) * max_vm); + TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); + for (i = 0; i < max_vm; ++i) { + vms[i] = vm_create(VM_MODE_DEFAULT, + DEFAULT_GUEST_PHY_PAGES, O_RDWR); + for (j = 0; j < max_vcpu; ++j) + vm_vcpu_add(vms[i], j); + } + + /* Check stats read for every VM and VCPU */ + for (i = 0; i < max_vm; ++i) { + vm_stats_test(vms[i]); + for (j = 0; j < max_vcpu; ++j) + vcpu_stats_test(vms[i], j); + } + + for (i = 0; i < max_vm; ++i) + kvm_vm_free(vms[i]); + free(vms); + return 0; +} diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 1c4753fff19e..0d04a7db7f24 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = vm_create_with_vcpus(mode, nr_vcpus, + vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, guest_num_pages, 0, guest_code, NULL); /* Align down GPA of the testing memslot */ @@ -303,7 +303,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) TEST_MEM_SLOT_INDEX, guest_num_pages, 0); /* Do mapping(GVA->GPA) for the testing memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); diff --git a/tools/testing/selftests/kvm/lib/aarch64/handlers.S b/tools/testing/selftests/kvm/lib/aarch64/handlers.S new file mode 100644 index 000000000000..0e443eadfac6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/handlers.S @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +.macro save_registers + add sp, sp, #-16 * 17 + + stp x0, x1, [sp, #16 * 0] + stp x2, x3, [sp, #16 * 1] + stp x4, x5, [sp, #16 * 2] + stp x6, x7, [sp, #16 * 3] + stp x8, x9, [sp, #16 * 4] + stp x10, x11, [sp, #16 * 5] + stp x12, x13, [sp, #16 * 6] + stp x14, x15, [sp, #16 * 7] + stp x16, x17, [sp, #16 * 8] + stp x18, x19, [sp, #16 * 9] + stp x20, x21, [sp, #16 * 10] + stp x22, x23, [sp, #16 * 11] + stp x24, x25, [sp, #16 * 12] + stp x26, x27, [sp, #16 * 13] + stp x28, x29, [sp, #16 * 14] + + /* + * This stores sp_el1 into ex_regs.sp so exception handlers can "look" + * at it. It will _not_ be used to restore the sp on return from the + * exception so handlers can not update it. + */ + add x1, sp, #16 * 17 + stp x30, x1, [sp, #16 * 15] /* x30, SP */ + + mrs x1, elr_el1 + mrs x2, spsr_el1 + stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ +.endm + +.macro restore_registers + ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ + msr elr_el1, x1 + msr spsr_el1, x2 + + /* sp is not restored */ + ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ + + ldp x28, x29, [sp, #16 * 14] + ldp x26, x27, [sp, #16 * 13] + ldp x24, x25, [sp, #16 * 12] + ldp x22, x23, [sp, #16 * 11] + ldp x20, x21, [sp, #16 * 10] + ldp x18, x19, [sp, #16 * 9] + ldp x16, x17, [sp, #16 * 8] + ldp x14, x15, [sp, #16 * 7] + ldp x12, x13, [sp, #16 * 6] + ldp x10, x11, [sp, #16 * 5] + ldp x8, x9, [sp, #16 * 4] + ldp x6, x7, [sp, #16 * 3] + ldp x4, x5, [sp, #16 * 2] + ldp x2, x3, [sp, #16 * 1] + ldp x0, x1, [sp, #16 * 0] + + add sp, sp, #16 * 17 + + eret +.endm + +.pushsection ".entry.text", "ax" +.balign 0x800 +.global vectors +vectors: +.popsection + +.set vector, 0 + +/* + * Build an exception handler for vector and append a jump to it into + * vectors (while making sure that it's 0x80 aligned). + */ +.macro HANDLER, label +handler_\label: + save_registers + mov x0, sp + mov x1, #vector + bl route_exception + restore_registers + +.pushsection ".entry.text", "ax" +.balign 0x80 + b handler_\label +.popsection + +.set vector, vector + 1 +.endm + +.macro HANDLER_INVALID +.pushsection ".entry.text", "ax" +.balign 0x80 +/* This will abort so no need to save and restore registers. */ + mov x0, #vector + mov x1, #0 /* ec */ + mov x2, #0 /* valid_ec */ + b kvm_exit_unexpected_exception +.popsection + +.set vector, vector + 1 +.endm + +/* + * Caution: be sure to not add anything between the declaration of vectors + * above and these macro calls that will build the vectors table below it. + */ + HANDLER_INVALID // Synchronous EL1t + HANDLER_INVALID // IRQ EL1t + HANDLER_INVALID // FIQ EL1t + HANDLER_INVALID // Error EL1t + + HANDLER el1h_sync // Synchronous EL1h + HANDLER el1h_irq // IRQ EL1h + HANDLER el1h_fiq // FIQ EL1h + HANDLER el1h_error // Error EL1h + + HANDLER el0_sync_64 // Synchronous 64-bit EL0 + HANDLER el0_irq_64 // IRQ 64-bit EL0 + HANDLER el0_fiq_64 // FIQ 64-bit EL0 + HANDLER el0_error_64 // Error 64-bit EL0 + + HANDLER el0_sync_32 // Synchronous 32-bit EL0 + HANDLER el0_irq_32 // IRQ 32-bit EL0 + HANDLER el0_fiq_32 // FIQ 32-bit EL0 + HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index cee92d477dc0..632b74d6b3ca 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -6,14 +6,16 @@ */ #include <linux/compiler.h> +#include <assert.h> #include "kvm_util.h" #include "../kvm_util_internal.h" #include "processor.h" -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 +static vm_vaddr_t exception_handlers; + static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { return (v + vm->page_size) & ~(vm->page_size - 1); @@ -72,19 +74,19 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) return 1 << (vm->page_shift - 3); } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { if (!vm->pgd_created) { vm_paddr_t paddr = vm_phy_pages_alloc(vm, page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); vm->pgd = paddr; vm->pgd_created = true; } } -void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot, uint64_t flags) +static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t flags) { uint8_t attr_idx = flags & 7; uint64_t *ptep; @@ -104,25 +106,19 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, paddr, vm->max_gfn, vm->page_size); ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; @@ -135,12 +131,11 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ - _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); + _virt_pg_map(vm, vaddr, paddr, attr_idx); } vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) @@ -302,7 +297,7 @@ void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, DEFAULT_STACK_PGS * vm->page_size : vm->page_size; uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); vm_vcpu_add(vm, vcpuid); aarch64_vcpu_setup(vm, vcpuid, init); @@ -334,6 +329,100 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } +void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) +{ + ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); + while (1) + ; +} + void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) { + struct ucall uc; + + if (get_ucall(vm, vcpuid, &uc) != UCALL_UNHANDLED) + return; + + if (uc.args[2]) /* valid_ec */ { + assert(VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } else { + assert(!VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx)", + uc.args[0]); + } +} + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; +}; + +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +{ + extern char vectors; + + set_reg(vm, vcpuid, ARM64_SYS_REG(VBAR_EL1), (uint64_t)&vectors); +} + +void route_exception(struct ex_regs *regs, int vector) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + bool valid_ec; + int ec = 0; + + switch (vector) { + case VECTOR_SYNC_CURRENT: + case VECTOR_SYNC_LOWER_64: + ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK; + valid_ec = true; + break; + case VECTOR_IRQ_CURRENT: + case VECTOR_IRQ_LOWER_64: + case VECTOR_FIQ_CURRENT: + case VECTOR_FIQ_LOWER_64: + case VECTOR_ERROR_CURRENT: + case VECTOR_ERROR_LOWER_64: + ec = 0; + valid_ec = false; + break; + default: + valid_ec = false; + goto unexpected_exception; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + kvm_exit_unexpected_exception(vector, ec, valid_ec); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + assert(ec < ESR_EC_NUM); + handlers->exception_handlers[vector][ec] = handler; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(!VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector][0] = handler; } diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 2f37b90ee1a9..e0b0164e9af8 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -14,7 +14,7 @@ static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) return false; - virt_pg_map(vm, gpa, gpa, 0); + virt_pg_map(vm, gpa, gpa); ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; sync_global_to_guest(vm, ucall_exit_mmio_addr); diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index bc75a91e00a6..eac44f5d0db0 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -111,8 +111,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) * by the image and it needs to have sufficient available physical pages, to * back the virtual pages used to load the image. */ -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot) +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) { off_t offset, offset_rv; Elf64_Ehdr hdr; @@ -164,8 +163,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; - vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, - data_memslot, pgd_memslot); + vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart); TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " "virtual memory for segment at requested min addr,\n" " segment idx: %u\n" diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 25bff307c71f..c330f414ef96 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -22,6 +22,22 @@ void guest_modes_append_default(void) } } #endif +#ifdef __s390x__ + { + int kvm_fd, vm_fd; + struct kvm_s390_vm_cpu_processor info; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0); + kvm_device_access(vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info, false); + close(vm_fd); + close(kvm_fd); + /* Starting with z13 we have 47bits of physical address */ + if (info.ibc >= 0x30) + guest_mode_append(VM_MODE_P47V64_4K, true, true); + } +#endif } void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index fc83f6c5902d..10a8ed691c66 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -32,6 +32,34 @@ static void *align(void *x, size_t size) } /* + * Open KVM_DEV_PATH if available, otherwise exit the entire program. + * + * Input Args: + * flags - The flags to pass when opening KVM_DEV_PATH. + * + * Return: + * The opened file descriptor of /dev/kvm. + */ +static int _open_kvm_dev_path_or_exit(int flags) +{ + int fd; + + fd = open(KVM_DEV_PATH, flags); + if (fd < 0) { + print_skip("%s not available, is KVM loaded? (errno: %d)", + KVM_DEV_PATH, errno); + exit(KSFT_SKIP); + } + + return fd; +} + +int open_kvm_dev_path_or_exit(void) +{ + return _open_kvm_dev_path_or_exit(O_RDONLY); +} + +/* * Capability * * Input Args: @@ -52,12 +80,9 @@ int kvm_check_cap(long cap) int ret; int kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); - + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n" " rc: %i errno: %i", ret, errno); close(kvm_fd); @@ -128,9 +153,7 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) static void vm_open(struct kvm_vm *vm, int perm) { - vm->kvm_fd = open(KVM_DEV_PATH, perm); - if (vm->kvm_fd < 0) - exit(KSFT_SKIP); + vm->kvm_fd = _open_kvm_dev_path_or_exit(perm); if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { print_skip("immediate_exit not available"); @@ -152,6 +175,8 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", + [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -169,6 +194,8 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { { 40, 48, 0x1000, 12 }, { 40, 48, 0x10000, 16 }, { 0, 0, 0x1000, 12 }, + { 47, 64, 0x1000, 12 }, + { 44, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -203,7 +230,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_ASSERT(vm != NULL, "Insufficient Memory"); INIT_LIST_HEAD(&vm->vcpus); - INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); vm->mode = mode; vm->type = 0; @@ -252,6 +281,12 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; + case VM_MODE_P44V64_4K: + vm->pgtable_levels = 5; + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } @@ -283,21 +318,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * slot0_mem_pages - Slot0 physical memory size + * extra_mem_pages - Non-slot0 physical memory total size + * num_percpu_pages - Per-cpu physical memory pages + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), + * with customized slot0 memory size, at least 512 pages currently. + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. + */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]) + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]) { + uint64_t vcpu_pages, extra_pg_pages, pages; + struct kvm_vm *vm; + int i; + + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ + if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) + slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; + /* The maximum page table size for a memory region will be when the * smallest pages are used. Considering each page contains x page * table descriptors, the total extra size for page tables (for extra * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller * than N/x*2. */ - uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; - uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; - struct kvm_vm *vm; - int i; + vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; + extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; + pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -306,7 +370,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, pages = vm_adjust_num_guest_pages(mode, pages); vm = vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); @@ -316,10 +380,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint32_t vcpuid = vcpuids ? vcpuids[i] : i; vm_vcpu_add_default(vm, vcpuid, guest_code); - -#ifdef __x86_64__ - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); -#endif } return vm; @@ -329,8 +389,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]) { - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages, - num_percpu_pages, guest_code, vcpuids); + return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, num_percpu_pages, guest_code, vcpuids); } struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, @@ -355,13 +415,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, */ void kvm_vm_restart(struct kvm_vm *vmp, int perm) { + int ctr; struct userspace_mem_region *region; vm_open(vmp, perm); if (vmp->has_irqchip) vm_create_irqchip(vmp); - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@ -424,14 +485,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; @@ -546,11 +614,16 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region) + struct userspace_mem_region *region, + bool unlink) { int ret; - list_del(®ion->list); + if (unlink) { + rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); + rb_erase(®ion->hva_node, &vm->regions.hva_tree); + hash_del(®ion->slot_node); + } region->region.memory_size = 0; ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); @@ -569,14 +642,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, */ void kvm_vm_free(struct kvm_vm *vmp) { - struct userspace_mem_region *region, *tmp; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) - __vm_mem_region_delete(vmp, region); + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -658,13 +733,64 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->gpa_node, parent, cur); + rb_insert_color(®ion->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->hva_node, parent, cur); + rb_insert_color(®ion->hva_node, hva_tree); +} + /* * VM Userspace Memory Region Add * * Input Args: * vm - Virtual Machine - * backing_src - Storage source for this region. - * NULL to use anonymous memory. + * src_type - Storage source for this region. + * NULL to use anonymous memory. * guest_paddr - Starting guest physical address * slot - KVM region slot * npages - Number of physical pages @@ -722,7 +848,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { if (region->region.slot != slot) continue; @@ -755,11 +882,30 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, if (alignment > 1) region->mmap_size += alignment; + region->fd = -1; + if (backing_src_is_shared(src_type)) { + int memfd_flags = MFD_CLOEXEC; + + if (src_type == VM_MEM_SRC_SHARED_HUGETLB) + memfd_flags |= MFD_HUGETLB; + + region->fd = memfd_create("kvm_selftest", memfd_flags); + TEST_ASSERT(region->fd != -1, + "memfd_create failed, errno: %i", errno); + + ret = ftruncate(region->fd, region->mmap_size); + TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno); + + ret = fallocate(region->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + region->mmap_size); + TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno); + } + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS - | vm_mem_backing_src_alias(src_type)->flag, - -1, 0); + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, "test_malloc failed, mmap_start: %p errno: %i", region->mmap_start, errno); @@ -793,8 +939,23 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size); - /* Add to linked-list of memory regions. */ - list_add(®ion->list, &vm->userspace_mem_regions); + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); + + /* If shared memory, create an alias. */ + if (region->fd >= 0) { + region->mmap_alias = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); + TEST_ASSERT(region->mmap_alias != MAP_FAILED, + "mmap of alias failed, errno: %i", errno); + + /* Align host alias address */ + region->host_alias = align(region->mmap_alias, alignment); + } } /* @@ -817,10 +978,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) return region; - } fprintf(stderr, "No mem region with the requested slot found,\n" " requested slot: %u\n", memslot); @@ -905,7 +1066,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } /* @@ -925,9 +1086,7 @@ static int vcpu_mmap_sz(void) { int dev_fd, ret; - dev_fd = open(KVM_DEV_PATH, O_RDONLY); - if (dev_fd < 0) - exit(KSFT_SKIP); + dev_fd = open_kvm_dev_path_or_exit(); ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); TEST_ASSERT(ret >= sizeof(struct kvm_run), @@ -1093,12 +1252,13 @@ va_found: * a unique set of pages, with the minimum real allocation being at least * a page. */ -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot) +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) { uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); - virt_pgd_alloc(vm, pgd_memslot); + virt_pgd_alloc(vm); + vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, + KVM_UTIL_MIN_PFN * vm->page_size, 0); /* * Find an unused range of virtual page addresses of at least @@ -1108,13 +1268,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, /* Map the virtual pages. */ for (vm_vaddr_t vaddr = vaddr_start; pages > 0; - pages--, vaddr += vm->page_size) { - vm_paddr_t paddr; - - paddr = vm_phy_page_alloc(vm, - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); + pages--, vaddr += vm->page_size, paddr += vm->page_size) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); @@ -1124,6 +1280,44 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, } /* + * VM Virtual Address Allocate Pages + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least N system pages worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) +{ + return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); +} + +/* + * VM Virtual Address Allocate Page + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least one system page worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) +{ + return vm_vaddr_alloc_pages(vm, 1); +} + +/* * Map a range of VM virtual address to the VM's physical address * * Input Args: @@ -1141,7 +1335,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, * @npages starting at @vaddr to the page range starting at @paddr. */ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot) + unsigned int npages) { size_t page_size = vm->page_size; size_t size = npages * page_size; @@ -1150,7 +1344,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); vaddr += page_size; paddr += page_size; } @@ -1177,16 +1371,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_FAIL("No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1208,15 +1400,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; + struct rb_node *node; + + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + if (hva >= region->host_mem) { + if (hva <= (region->host_mem + + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); + + node = node->rb_right; + } else + node = node->rb_left; } TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); @@ -1224,6 +1423,42 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) } /* + * Address VM physical to Host Virtual *alias*. + * + * Input Args: + * vm - Virtual Machine + * gpa - VM physical address + * + * Output Args: None + * + * Return: + * Equivalent address within the host virtual *alias* area, or NULL + * (without failing the test) if the guest memory is not shared (so + * no alias exists). + * + * When vm_create() and related functions are called with a shared memory + * src_type, we also create a writable, shared alias mapping of the + * underlying guest memory. This allows the host to manipulate guest memory + * without mapping that memory in the guest's address space. And, for + * userfaultfd-based demand paging, we can do so without triggering userfaults. + */ +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) +{ + struct userspace_mem_region *region; + uintptr_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) + return NULL; + + if (!region->host_alias) + return NULL; + + offset = gpa - region->region.guest_phys_addr; + return (void *) ((uintptr_t) region->host_alias + offset); +} + +/* * VM Create IRQ Chip * * Input Args: @@ -1822,6 +2057,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; struct vcpu *vcpu; @@ -1829,7 +2065,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -1978,6 +2214,14 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); } +/* Arbitrary minimum physical address used for virtual translation tables. */ +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) +{ + return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); +} + /* * Address Guest Virtual to Host Virtual * @@ -2015,10 +2259,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm) if (vm == NULL) { /* Ensure that the KVM vendor-specific module is loaded. */ - f = fopen(KVM_DEV_PATH, "r"); - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", - errno); - fclose(f); + close(open_kvm_dev_path_or_exit()); } f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); @@ -2041,7 +2282,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm) return vm->page_shift; } -unsigned int vm_get_max_gfn(struct kvm_vm *vm) +uint64_t vm_get_max_gfn(struct kvm_vm *vm) { return vm->max_gfn; } @@ -2090,3 +2331,15 @@ unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); return vm_adjust_num_guest_pages(mode, n); } + +int vm_get_stats_fd(struct kvm_vm *vm) +{ + return ioctl(vm->fd, KVM_GET_STATS_FD, NULL); +} + +int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + return ioctl(vcpu->fd, KVM_GET_STATS_FD, NULL); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 91ce1b5d480b..a03febc24ba6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -8,6 +8,9 @@ #ifndef SELFTEST_KVM_UTIL_INTERNAL_H #define SELFTEST_KVM_UTIL_INTERNAL_H +#include "linux/hashtable.h" +#include "linux/rbtree.h" + #include "sparsebit.h" struct userspace_mem_region { @@ -16,9 +19,13 @@ struct userspace_mem_region { int fd; off_t offset; void *host_mem; + void *host_alias; void *mmap_start; + void *mmap_alias; size_t mmap_size; - struct list_head list; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; }; struct vcpu { @@ -31,6 +38,12 @@ struct vcpu { uint32_t dirty_gfns_count; }; +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + struct kvm_vm { int mode; unsigned long type; @@ -43,7 +56,7 @@ struct kvm_vm { unsigned int va_bits; uint64_t max_gfn; struct list_head vcpus; - struct list_head userspace_mem_regions; + struct userspace_mem_regions regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 81490b9b4e32..b488f4aefea8 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2020, Google LLC. */ +#include <inttypes.h> #include "kvm_util.h" #include "perf_test_util.h" @@ -68,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); - vm = vm_create_with_vcpus(mode, vcpus, + vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, 0, guest_code, NULL); @@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, */ TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " vcpus: %d wss: %" PRIx64 "]\n", guest_num_pages, vm_get_max_gfn(vm), vcpus, vcpu_memory_bytes); @@ -99,7 +101,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, guest_num_pages, 0); /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); ucall_init(vm, NULL); diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c" diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 0152f356c099..f87c7137598e 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -9,11 +9,9 @@ #include "kvm_util.h" #include "../kvm_util_internal.h" -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - #define PAGES_PER_REGION 4 -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; @@ -24,7 +22,7 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); vm->pgd = paddr; @@ -36,12 +34,12 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) * a page table (ri == 4). Returns a suitable region/segment table entry * which points to the freshly allocated pages. */ -static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) { uint64_t taddr; taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); return (taddr & REGION_ENTRY_ORIGIN) @@ -49,8 +47,7 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, - uint32_t memslot) +void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) { int ri, idx; uint64_t *entry; @@ -77,7 +74,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) - entry[idx] = virt_alloc_region(vm, ri, memslot); + entry[idx] = virt_alloc_region(vm, ri); entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } @@ -170,7 +167,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) vm->page_size); stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); vm_vcpu_add(vm, vcpuid); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 63d2bc7d757b..af1031fed97f 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -166,72 +166,89 @@ size_t get_def_hugetlb_pagesz(void) return 0; } +#define ANON_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#define ANON_HUGE_FLAGS (ANON_FLAGS | MAP_HUGETLB) + const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) { static const struct vm_mem_backing_src_alias aliases[] = { [VM_MEM_SRC_ANONYMOUS] = { .name = "anonymous", - .flag = 0, + .flag = ANON_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_THP] = { .name = "anonymous_thp", - .flag = 0, + .flag = ANON_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { .name = "anonymous_hugetlb", - .flag = MAP_HUGETLB, + .flag = ANON_HUGE_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { .name = "anonymous_hugetlb_16kb", - .flag = MAP_HUGETLB | MAP_HUGE_16KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { .name = "anonymous_hugetlb_64kb", - .flag = MAP_HUGETLB | MAP_HUGE_64KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_64KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { .name = "anonymous_hugetlb_512kb", - .flag = MAP_HUGETLB | MAP_HUGE_512KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { .name = "anonymous_hugetlb_1mb", - .flag = MAP_HUGETLB | MAP_HUGE_1MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { .name = "anonymous_hugetlb_2mb", - .flag = MAP_HUGETLB | MAP_HUGE_2MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { .name = "anonymous_hugetlb_8mb", - .flag = MAP_HUGETLB | MAP_HUGE_8MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_8MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { .name = "anonymous_hugetlb_16mb", - .flag = MAP_HUGETLB | MAP_HUGE_16MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { .name = "anonymous_hugetlb_32mb", - .flag = MAP_HUGETLB | MAP_HUGE_32MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_32MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { .name = "anonymous_hugetlb_256mb", - .flag = MAP_HUGETLB | MAP_HUGE_256MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_256MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { .name = "anonymous_hugetlb_512mb", - .flag = MAP_HUGETLB | MAP_HUGE_512MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { .name = "anonymous_hugetlb_1gb", - .flag = MAP_HUGETLB | MAP_HUGE_1GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { .name = "anonymous_hugetlb_2gb", - .flag = MAP_HUGETLB | MAP_HUGE_2GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { .name = "anonymous_hugetlb_16gb", - .flag = MAP_HUGETLB | MAP_HUGE_16GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16GB, + }, + [VM_MEM_SRC_SHMEM] = { + .name = "shmem", + .flag = MAP_SHARED, + }, + [VM_MEM_SRC_SHARED_HUGETLB] = { + .name = "shared_hugetlb", + /* + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since + * we're using "file backed" memory, we need to specify + * this when the FD is created, not when the area is + * mapped. + */ + .flag = MAP_SHARED, }, }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, @@ -250,10 +267,12 @@ size_t get_backing_src_pagesz(uint32_t i) switch (i) { case VM_MEM_SRC_ANONYMOUS: + case VM_MEM_SRC_SHMEM: return getpagesize(); case VM_MEM_SRC_ANONYMOUS_THP: return get_trans_hugepagesz(); case VM_MEM_SRC_ANONYMOUS_HUGETLB: + case VM_MEM_SRC_SHARED_HUGETLB: return get_def_hugetlb_pagesz(); default: return MAP_HUGE_PAGE_SIZE(flag); diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c new file mode 100644 index 000000000000..7168e25c194e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/apic.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/processor.c + * + * Copyright (C) 2021, Google LLC. + */ + +#include "apic.h" + +void apic_disable(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void xapic_enable(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + apic_disable(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. + */ + val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + xapic_write_reg(APIC_SPIV, val); +} + +void x2apic_enable(void) +{ + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + x2apic_write_reg(APIC_SPIV, + x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a8906e60a108..28cb881f440d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -17,13 +17,10 @@ #define DEFAULT_CODE_SELECTOR 0x8 #define DEFAULT_DATA_SELECTOR 0x10 -/* Minimum physical address used for virtual translation tables. */ -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - vm_vaddr_t exception_handlers; /* Virtual translation table structure declarations */ -struct pageMapL4Entry { +struct pageUpperEntry { uint64_t present:1; uint64_t writable:1; uint64_t user:1; @@ -33,37 +30,7 @@ struct pageMapL4Entry { uint64_t ignored_06:1; uint64_t page_size:1; uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryPointerEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; + uint64_t pfn:40; uint64_t ignored_62_52:11; uint64_t execute_disable:1; }; @@ -79,7 +46,7 @@ struct pageTableEntry { uint64_t reserved_07:1; uint64_t global:1; uint64_t ignored_11_09:3; - uint64_t address:40; + uint64_t pfn:40; uint64_t ignored_62_52:11; uint64_t execute_disable:1; }; @@ -207,96 +174,211 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, } } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. */ if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - vm->pgd = paddr; + vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; } } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, + int level) +{ + uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift); + int index = vaddr >> (vm->page_shift + level * 9) & 0x1ffu; + + return &page_table[index]; +} + +static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t pt_pfn, + uint64_t vaddr, + uint64_t paddr, + int level, + enum x86_page_size page_size) +{ + struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level); + + if (!pte->present) { + pte->writable = true; + pte->present = true; + pte->page_size = (level == page_size); + if (pte->page_size) + pte->pfn = paddr >> vm->page_shift; + else + pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. + */ + TEST_ASSERT(level != page_size, + "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", + page_size, vaddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, vaddr: 0x%lx\n", + level, vaddr); + } + return pte; +} + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + enum x86_page_size page_size) +{ + const uint64_t pg_size = 1ull << ((page_size * 9) + 12); + struct pageUpperEntry *pml4e, *pdpe, *pde; + struct pageTableEntry *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % pg_size) == 0, + "Virtual address not aligned,\n" + "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % pg_size) == 0, + "Physical address not aligned,\n" + " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + /* + * Allocate upper level page tables, if not already present. Return + * early if a hugepage was created. + */ + pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, + vaddr, paddr, 3, page_size); + if (pml4e->page_size) + return; + + pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size); + if (pdpe->page_size) + return; + + pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size); + if (pde->page_size) + return; + + /* Fill in page table entry. */ + pte = virt_get_pte(vm, pde->pfn, vaddr, 0); + TEST_ASSERT(!pte->present, + "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); + pte->pfn = paddr >> vm->page_shift; + pte->writable = true; + pte->present = 1; +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K); +} + +static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, + uint64_t vaddr) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; + struct pageUpperEntry *pml4e, *pdpe, *pde; + struct pageTableEntry *pte; + struct kvm_cpuid_entry2 *entry; + struct kvm_sregs sregs; + int max_phy_addr; + /* Set the bottom 52 bits. */ + uint64_t rsvd_mask = 0x000fffffffffffff; + + entry = kvm_get_supported_cpuid_index(0x80000008, 0); + max_phy_addr = entry->eax & 0x000000ff; + /* Clear the bottom bits of the reserved mask. */ + rsvd_mask = (rsvd_mask >> max_phy_addr) << max_phy_addr; + + /* + * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries + * with 4-Level Paging and 5-Level Paging". + * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1, + * the XD flag (bit 63) is reserved. + */ + vcpu_sregs_get(vm, vcpuid, &sregs); + if ((sregs.efer & EFER_NX) == 0) { + rsvd_mask |= (1ull << 63); + } TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((vaddr % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", - vaddr, vm->page_size); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); + /* + * Based on the mode check above there are 48 bits in the vaddr, so + * shift 16 to sign extend the last bit (bit-47), + */ + TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), + "Canonical check failed. The virtual address is invalid."); index[0] = (vaddr >> 12) & 0x1ffu; index[1] = (vaddr >> 21) & 0x1ffu; index[2] = (vaddr >> 30) & 0x1ffu; index[3] = (vaddr >> 39) & 0x1ffu; - /* Allocate page directory pointer table if not present. */ pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].present = true; - } + TEST_ASSERT(pml4e[index[3]].present, + "Expected pml4e to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) & + (rsvd_mask | (1ull << 7))) == 0, + "Unexpected reserved bits set."); + + pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); + TEST_ASSERT(pdpe[index[2]].present, + "Expected pdpe to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(pdpe[index[2]].page_size == 0, + "Expected pdpe to map a pde not a 1-GByte page."); + TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); + TEST_ASSERT(pde[index[1]].present, + "Expected pde to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(pde[index[1]].page_size == 0, + "Expected pde to map a pte not a 2-MByte page."); + TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); + TEST_ASSERT(pte[index[0]].present, + "Expected pte to be present for gva: 0x%08lx", vaddr); + + return &pte[index[0]]; +} - /* Allocate page directory table if not present. */ - struct pageDirectoryPointerEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].present = true; - } +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) +{ + struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); - /* Allocate page table if not present. */ - struct pageDirectoryEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].present = true; - } + return *(uint64_t *)pte; +} - /* Fill in page table entry. */ - struct pageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].present = 1; +void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, + uint64_t pte) +{ + struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid, + vaddr); + + *(uint64_t *)new_pte = pte; } void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - struct pageMapL4Entry *pml4e, *pml4e_start; - struct pageDirectoryPointerEntry *pdpe, *pdpe_start; - struct pageDirectoryEntry *pde, *pde_start; + struct pageUpperEntry *pml4e, *pml4e_start; + struct pageUpperEntry *pdpe, *pdpe_start; + struct pageUpperEntry *pde, *pde_start; struct pageTableEntry *pte, *pte_start; if (!vm->pgd_created) @@ -307,8 +389,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, - vm->pgd); + pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; if (!pml4e->present) @@ -317,11 +398,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) " %u\n", indent, "", pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, + addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn, pml4e->writable, pml4e->execute_disable); - pdpe_start = addr_gpa2hva(vm, pml4e->address - * vm->page_size); + pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; if (!pdpe->present) @@ -331,11 +411,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - (uint64_t) pdpe->address, pdpe->writable, + (uint64_t) pdpe->pfn, pdpe->writable, pdpe->execute_disable); - pde_start = addr_gpa2hva(vm, - pdpe->address * vm->page_size); + pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; if (!pde->present) @@ -344,11 +423,10 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) "0x%-12lx 0x%-10lx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - (uint64_t) pde->address, pde->writable, + (uint64_t) pde->pfn, pde->writable, pde->execute_disable); - pte_start = addr_gpa2hva(vm, - pde->address * vm->page_size); + pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; if (!pte->present) @@ -359,7 +437,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) indent, "", pte - pte_start, pte, addr_hva2gpa(vm, pte), - (uint64_t) pte->address, + (uint64_t) pte->pfn, pte->writable, pte->execute_disable, pte->dirty, @@ -480,9 +558,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; - struct pageDirectoryPointerEntry *pdpe; - struct pageDirectoryEntry *pde; + struct pageUpperEntry *pml4e, *pdpe, *pde; struct pageTableEntry *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ -499,43 +575,39 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) if (!pml4e[index[3]].present) goto unmapped_gva; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); if (!pdpe[index[2]].present) goto unmapped_gva; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); if (!pde[index[1]].present) goto unmapped_gva; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); if (!pte[index[0]].present) goto unmapped_gva; - return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu); unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(EXIT_FAILURE); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, - int pgd_memslot) +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { if (!vm->gdt) - vm->gdt = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->gdt = vm_vaddr_alloc_page(vm); dt->base = vm->gdt; dt->limit = getpagesize(); } static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector, int gdt_memslot, - int pgd_memslot) + int selector) { if (!vm->tss) - vm->tss = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->tss = vm_vaddr_alloc_page(vm); memset(segp, 0, sizeof(*segp)); segp->base = vm->tss; @@ -546,7 +618,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +static void vcpu_setup(struct kvm_vm *vm, int vcpuid) { struct kvm_sregs sregs; @@ -555,7 +627,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.idt.limit = 0; - kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + kvm_setup_gdt(vm, &sregs.gdt); switch (vm->mode) { case VM_MODE_PXXV48_4K: @@ -567,7 +639,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); break; default: @@ -584,11 +656,11 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) struct kvm_regs regs; vm_vaddr_t stack_vaddr; stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); /* Create VCPU */ vm_vcpu_add(vm, vcpuid); - vcpu_setup(vm, vcpuid, 0, 0); + vcpu_setup(vm, vcpuid); /* Setup guest general purpose registers */ vcpu_regs_get(vm, vcpuid, ®s); @@ -600,6 +672,9 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) /* Setup the MP state */ mp_state.mp_state = 0; vcpu_set_mp_state(vm, vcpuid, &mp_state); + + /* Setup supported CPUIDs */ + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); } /* @@ -657,9 +732,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", @@ -691,9 +764,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" @@ -986,9 +1057,7 @@ struct kvm_msr_list *kvm_get_msr_index_list(void) struct kvm_msr_list *list; int nmsrs, r, kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs = kvm_get_num_msrs_fd(kvm_fd); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); @@ -1207,7 +1276,7 @@ static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, void kvm_exit_unexpected_vector(uint32_t value) { - outl(UNEXPECTED_VECTOR_PORT, value); + ucall(UCALL_UNHANDLED, 1, value); } void route_exception(struct ex_regs *regs) @@ -1228,8 +1297,8 @@ void vm_init_descriptor_tables(struct kvm_vm *vm) extern void *idt_handlers; int i; - vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0); - vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0); + vm->idt = vm_vaddr_alloc_page(vm); + vm->handlers = vm_vaddr_alloc_page(vm); /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, @@ -1250,8 +1319,8 @@ void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } -void vm_handle_exception(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) { vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); @@ -1260,16 +1329,13 @@ void vm_handle_exception(struct kvm_vm *vm, int vector, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) { - if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO - && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT - && vcpu_state(vm, vcpuid)->io.size == 4) { - /* Grab pointer to io data */ - uint32_t *data = (void *)vcpu_state(vm, vcpuid) - + vcpu_state(vm, vcpuid)->io.data_offset; - - TEST_ASSERT(false, - "Unexpected vectored event in guest (vector:0x%x)", - *data); + struct ucall uc; + + if (get_ucall(vm, vcpuid, &uc) == UCALL_UNHANDLED) { + uint64_t vector = uc.args[0]; + + TEST_FAIL("Unexpected vectored event in guest (vector:0x%lx)", + vector); } } @@ -1312,9 +1378,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n", diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 827fe6028dd4..2ac98d70d02b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -30,17 +30,14 @@ u64 rflags; struct svm_test_data * vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) { - vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); - svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->vmcb = (void *)vm_vaddr_alloc_page(vm); svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); - svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->save_area = (void *)vm_vaddr_alloc_page(vm); svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 2448b30e8efa..d089d8b850b5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -77,50 +77,48 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) struct vmx_pages * vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) { - vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); /* Setup of a region of guest memory for the vmxon region. */ - vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); /* Setup of a region of guest memory for a vmcs. */ - vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); /* Setup of a region of guest memory for the MSR bitmap. */ - vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr = (void *)vm_vaddr_alloc_page(vm); vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); memset(vmx->msr_hva, 0, getpagesize()); /* Setup of a region of guest memory for the shadow VMCS. */ - vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ - vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread = (void *)vm_vaddr_alloc_page(vm); vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); memset(vmx->vmread_hva, 0, getpagesize()); - vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); /* Setup of a region of guest memory for the VP Assist page. */ - vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->vp_assist = (void *)vm_vaddr_alloc_page(vm); vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); /* Setup of a region of guest memory for the enlightened VMCS. */ - vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); vmx->enlightened_vmcs_gpa = @@ -395,7 +393,7 @@ void nested_vmx_check_supported(void) } void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) + uint64_t nested_paddr, uint64_t paddr) { uint16_t index[4]; struct eptPageTableEntry *pml4e; @@ -428,9 +426,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, /* Allocate page directory pointer table if not present. */ pml4e = vmx->eptp_hva; if (!pml4e[index[3]].readable) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pml4e[index[3]].address = vm_alloc_page_table(vm) >> vm->page_shift; pml4e[index[3]].writable = true; pml4e[index[3]].readable = true; pml4e[index[3]].executable = true; @@ -440,9 +436,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pdpe; pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); if (!pdpe[index[2]].readable) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pdpe[index[2]].address = vm_alloc_page_table(vm) >> vm->page_shift; pdpe[index[2]].writable = true; pdpe[index[2]].readable = true; pdpe[index[2]].executable = true; @@ -452,9 +446,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pde; pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); if (!pde[index[1]].readable) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; + pde[index[1]].address = vm_alloc_page_table(vm) >> vm->page_shift; pde[index[1]].writable = true; pde[index[1]].readable = true; pde[index[1]].executable = true; @@ -494,8 +486,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * page range starting at nested_paddr to the page range starting at paddr. */ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot) + uint64_t nested_paddr, uint64_t paddr, uint64_t size) { size_t page_size = vm->page_size; size_t npages = size / page_size; @@ -504,7 +495,7 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + nested_pg_map(vmx, vm, nested_paddr, paddr); nested_paddr += page_size; paddr += page_size; } @@ -514,7 +505,7 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, * physical pages in VM. */ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot) + uint32_t memslot) { sparsebit_idx_t i, last; struct userspace_mem_region *region = @@ -530,24 +521,21 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, nested_map(vmx, vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, - 1 << vm->page_shift, - eptp_memslot); + 1 << vm->page_shift); } } void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); } -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) { - vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 6096bf0a5b34..98351ba0933c 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -71,14 +71,22 @@ struct memslot_antagonist_args { }; static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, - uint64_t nr_modifications, uint64_t gpa) + uint64_t nr_modifications) { + const uint64_t pages = 1; + uint64_t gpa; int i; + /* + * Add the dummy memslot just below the perf_test_util memslot, which is + * at the top of the guest physical address space. + */ + gpa = guest_test_phys_mem - pages * vm_get_page_size(vm); + for (i = 0; i < nr_modifications; i++) { usleep(delay); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, - DUMMY_MEMSLOT_INDEX, 1, 0); + DUMMY_MEMSLOT_INDEX, pages, 0); vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); } @@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Started all vCPUs\n"); add_remove_memslot(vm, p->memslot_modification_delay, - p->nr_memslot_modifications, - guest_test_phys_mem + - (guest_percpu_mem_size * nr_vcpus) + - perf_test_args.host_page_size + - perf_test_args.guest_page_size); + p->nr_memslot_modifications); run_vcpus = false; diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c new file mode 100644 index 000000000000..d6e381e01db7 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -0,0 +1,1037 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A memslot-related performance benchmark. + * + * Copyright (C) 2021 Oracle and/or its affiliates. + * + * Basic guest setup / host vCPU thread code lifted from set_memory_region_test. + */ +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <unistd.h> + +#include <linux/compiler.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +#define MEM_SIZE ((512U << 20) + 4096) +#define MEM_SIZE_PAGES (MEM_SIZE / 4096) +#define MEM_GPA 0x10000000UL +#define MEM_AUX_GPA MEM_GPA +#define MEM_SYNC_GPA MEM_AUX_GPA +#define MEM_TEST_GPA (MEM_AUX_GPA + 4096) +#define MEM_TEST_SIZE (MEM_SIZE - 4096) +static_assert(MEM_SIZE % 4096 == 0, "invalid mem size"); +static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); + +/* + * 32 MiB is max size that gets well over 100 iterations on 509 slots. + * Considering that each slot needs to have at least one page up to + * 8194 slots in use can then be tested (although with slightly + * limited resolution). + */ +#define MEM_SIZE_MAP ((32U << 20) + 4096) +#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096) +#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) +#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096) +static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size"); + +/* + * 128 MiB is min size that fills 32k slots with at least one page in each + * while at the same time gets 100+ iterations in such test + */ +#define MEM_TEST_UNMAP_SIZE (128U << 20) +#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096) +/* 2 MiB chunk size like a typical huge page */ +#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12)) +static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE_PAGES % + (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0, + "invalid unmap test region size"); + +/* + * For the move active test the middle of the test area is placed on + * a memslot boundary: half lies in the memslot being moved, half in + * other memslot(s). + * + * When running this test with 32k memslots (32764, really) each memslot + * contains 4 pages. + * The last one additionally contains the remaining 21 pages of memory, + * for the total size of 25 pages. + * Hence, the maximum size here is 50 pages. + */ +#define MEM_TEST_MOVE_SIZE_PAGES (50) +#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096) +#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) +static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, + "invalid move test region size"); + +#define MEM_TEST_VAL_1 0x1122334455667788 +#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00 + +struct vm_data { + struct kvm_vm *vm; + pthread_t vcpu_thread; + uint32_t nslots; + uint64_t npages; + uint64_t pages_per_slot; + void **hva_slots; + bool mmio_ok; + uint64_t mmio_gpa_min; + uint64_t mmio_gpa_max; +}; + +struct sync_area { + atomic_bool start_flag; + atomic_bool exit_flag; + atomic_bool sync_flag; + void *move_area_ptr; +}; + +/* + * Technically, we need also for the atomic bool to be address-free, which + * is recommended, but not strictly required, by C11 for lockless + * implementations. + * However, in practice both GCC and Clang fulfill this requirement on + * all KVM-supported platforms. + */ +static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); + +static sem_t vcpu_ready; + +static bool map_unmap_verify; + +static bool verbose; +#define pr_info_v(...) \ + do { \ + if (verbose) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static void *vcpu_worker(void *data) +{ + struct vm_data *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + run = vcpu_state(vm->vm, VCPU_ID); + while (1) { + vcpu_run(vm->vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm->vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit"); + TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min && + run->mmio.phys_addr <= vm->mmio_gpa_max, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); +} + +static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) +{ + uint64_t gpage, pgoffs; + uint32_t slot, slotoffs; + void *base; + + TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); + TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096, + "Too high gpa to translate"); + gpa -= MEM_GPA; + + gpage = gpa / 4096; + pgoffs = gpa % 4096; + slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1); + slotoffs = gpage - (slot * data->pages_per_slot); + + if (rempages) { + uint64_t slotpages; + + if (slot == data->nslots - 1) + slotpages = data->npages - slot * data->pages_per_slot; + else + slotpages = data->pages_per_slot; + + TEST_ASSERT(!pgoffs, + "Asking for remaining pages in slot but gpa not page aligned"); + *rempages = slotpages - slotoffs; + } + + base = data->hva_slots[slot]; + return (uint8_t *)base + slotoffs * 4096 + pgoffs; +} + +static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) +{ + TEST_ASSERT(slot < data->nslots, "Too high slot number"); + + return MEM_GPA + slot * data->pages_per_slot * 4096; +} + +static struct vm_data *alloc_vm(void) +{ + struct vm_data *data; + + data = malloc(sizeof(*data)); + TEST_ASSERT(data, "malloc(vmdata) failed"); + + data->vm = NULL; + data->hva_slots = NULL; + + return data; +} + +static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, + void *guest_code, uint64_t mempages, + struct timespec *slot_runtime) +{ + uint32_t max_mem_slots; + uint64_t rempages; + uint64_t guest_addr; + uint32_t slot; + struct timespec tstart; + struct sync_area *sync; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 1, + "KVM_CAP_NR_MEMSLOTS should be greater than 1"); + TEST_ASSERT(nslots > 1 || nslots == -1, + "Slot count cap should be greater than 1"); + if (nslots != -1) + max_mem_slots = min(max_mem_slots, (uint32_t)nslots); + pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); + + TEST_ASSERT(mempages > 1, + "Can't test without any memory"); + + data->npages = mempages; + data->nslots = max_mem_slots - 1; + data->pages_per_slot = mempages / data->nslots; + if (!data->pages_per_slot) { + *maxslots = mempages + 1; + return false; + } + + rempages = mempages % data->nslots; + data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); + TEST_ASSERT(data->hva_slots, "malloc() fail"); + + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); + + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", + max_mem_slots - 1, data->pages_per_slot, rempages); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { + uint64_t npages; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 1) + npages += rempages; + + vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, npages, + 0); + guest_addr += npages * 4096; + } + *slot_runtime = timespec_elapsed(tstart); + + for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) { + uint64_t npages; + uint64_t gpa; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 2) + npages += rempages; + + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, + slot + 1); + TEST_ASSERT(gpa == guest_addr, + "vm_phy_pages_alloc() failed\n"); + + data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr); + memset(data->hva_slots[slot], 0, npages * 4096); + + guest_addr += npages * 4096; + } + + virt_map(data->vm, MEM_GPA, MEM_GPA, mempages); + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + atomic_init(&sync->start_flag, false); + atomic_init(&sync->exit_flag, false); + atomic_init(&sync->sync_flag, false); + + data->mmio_ok = false; + + return true; +} + +static void launch_vm(struct vm_data *data) +{ + pr_info_v("Launching the test VM\n"); + + pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); +} + +static void free_vm(struct vm_data *data) +{ + kvm_vm_free(data->vm); + free(data->hva_slots); + free(data); +} + +static void wait_guest_exit(struct vm_data *data) +{ + pthread_join(data->vcpu_thread, NULL); +} + +static void let_guest_run(struct sync_area *sync) +{ + atomic_store_explicit(&sync->start_flag, true, memory_order_release); +} + +static void guest_spin_until_start(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire)) + ; +} + +static void make_guest_exit(struct sync_area *sync) +{ + atomic_store_explicit(&sync->exit_flag, true, memory_order_release); +} + +static bool _guest_should_exit(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + return atomic_load_explicit(&sync->exit_flag, memory_order_acquire); +} + +#define guest_should_exit() unlikely(_guest_should_exit()) + +/* + * noinline so we can easily see how much time the host spends waiting + * for the guest. + * For the same reason use alarm() instead of polling clock_gettime() + * to implement a wait timeout. + */ +static noinline void host_perform_sync(struct sync_area *sync) +{ + alarm(2); + + atomic_store_explicit(&sync->sync_flag, true, memory_order_release); + while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) + ; + + alarm(0); +} + +static bool guest_perform_sync(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + bool expected; + + do { + if (guest_should_exit()) + return false; + + expected = true; + } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag, + &expected, false, + memory_order_acq_rel, + memory_order_relaxed)); + + return true; +} + +static void guest_code_test_memslot_move(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (!guest_should_exit()) { + uintptr_t ptr; + + for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; + ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + /* + * No host sync here since the MMIO exits are so expensive + * that the host would spend most of its time waiting for + * the guest and so instead of measuring memslot move + * performance we would measure the performance and + * likelihood of MMIO exits + */ + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_map(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_unmap(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr = MEM_TEST_GPA; + + /* + * We can afford to access (map) just a small number of pages + * per host sync as otherwise the host will spend + * a significant amount of its time waiting for the guest + * (instead of doing unmap operations), so this will + * effectively turn this test into a map performance test. + * + * Just access a single page to be on the safe side. + */ + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + ptr += MEM_TEST_UNMAP_SIZE / 2; + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_rw(void) +{ + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + 4096 / 2; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) { + uint64_t val = *(uint64_t *)ptr; + + GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val); + *(uint64_t *)ptr = 0; + } + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static bool test_memslot_move_prepare(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots, bool isactive) +{ + uint64_t movesrcgpa, movetestgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + + if (isactive) { + uint64_t lastpages; + + vm_gpa2hva(data, movesrcgpa, &lastpages); + if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) { + *maxslots = 0; + return false; + } + } + + movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1)); + sync->move_area_ptr = (void *)movetestgpa; + + if (isactive) { + data->mmio_ok = true; + data->mmio_gpa_min = movesrcgpa; + data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1; + } + + return true; +} + +static bool test_memslot_move_prepare_active(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, true); +} + +static bool test_memslot_move_prepare_inactive(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, false); +} + +static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t movesrcgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, + MEM_TEST_MOVE_GPA_DEST); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa); +} + +static void test_memslot_do_unmap(struct vm_data *data, + uint64_t offsp, uint64_t count) +{ + uint64_t gpa, ctr; + + for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) { + uint64_t npages; + void *hva; + int ret; + + hva = vm_gpa2hva(data, gpa, &npages); + TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); + npages = min(npages, count - ctr); + ret = madvise(hva, npages * 4096, MADV_DONTNEED); + TEST_ASSERT(!ret, + "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, + hva, gpa); + ctr += npages; + gpa += npages * 4096; + } + TEST_ASSERT(ctr == count, + "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); +} + +static void test_memslot_map_unmap_check(struct vm_data *data, + uint64_t offsp, uint64_t valexp) +{ + uint64_t gpa; + uint64_t *val; + + if (!map_unmap_verify) + return; + + gpa = MEM_TEST_GPA + offsp * 4096; + val = (typeof(val))vm_gpa2hva(data, gpa, NULL); + TEST_ASSERT(*val == valexp, + "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")", + *val, valexp, gpa); + *val = 0; +} + +static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) +{ + /* + * Unmap the second half of the test area while guest writes to (maps) + * the first half. + */ + test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_MAP_SIZE_PAGES / 2); + + /* + * Wait for the guest to finish writing the first half of the test + * area, verify the written value on the first and the last page of + * this area and then unmap it. + * Meanwhile, the guest is writing to (mapping) the second half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + test_memslot_map_unmap_check(data, + MEM_TEST_MAP_SIZE_PAGES / 2 - 1, + MEM_TEST_VAL_1); + test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2); + + + /* + * Wait for the guest to finish writing the second half of the test + * area and verify the written value on the first and the last page + * of this area. + * The area will be unmapped at the beginning of the next loop + * iteration. + * Meanwhile, the guest is writing to (mapping) the first half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1, + MEM_TEST_VAL_2); +} + +static void test_memslot_unmap_loop_common(struct vm_data *data, + struct sync_area *sync, + uint64_t chunk) +{ + uint64_t ctr; + + /* + * Wait for the guest to finish mapping page(s) in the first half + * of the test area, verify the written value and then perform unmap + * of this area. + * Meanwhile, the guest is writing to (mapping) page(s) in the second + * half of the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); + + /* Likewise, but for the opposite host / guest areas */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2; + ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); +} + +static void test_memslot_unmap_loop(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, 1); +} + +static void test_memslot_unmap_loop_chunked(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES); +} + +static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t gptr; + + for (gptr = MEM_TEST_GPA + 4096 / 2; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) + *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; + + host_perform_sync(sync); + + for (gptr = MEM_TEST_GPA; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) { + uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); + uint64_t val = *vptr; + + TEST_ASSERT(val == MEM_TEST_VAL_1, + "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")", + val, gptr); + *vptr = 0; + } + + host_perform_sync(sync); +} + +struct test_data { + const char *name; + uint64_t mem_size; + void (*guest_code)(void); + bool (*prepare)(struct vm_data *data, struct sync_area *sync, + uint64_t *maxslots); + void (*loop)(struct vm_data *data, struct sync_area *sync); +}; + +static bool test_execute(int nslots, uint64_t *maxslots, + unsigned int maxtime, + const struct test_data *tdata, + uint64_t *nloops, + struct timespec *slot_runtime, + struct timespec *guest_runtime) +{ + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES; + struct vm_data *data; + struct sync_area *sync; + struct timespec tstart; + bool ret = true; + + data = alloc_vm(); + if (!prepare_vm(data, nslots, maxslots, tdata->guest_code, + mem_size, slot_runtime)) { + ret = false; + goto exit_free; + } + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + + if (tdata->prepare && + !tdata->prepare(data, sync, maxslots)) { + ret = false; + goto exit_free; + } + + launch_vm(data); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + let_guest_run(sync); + + while (1) { + *guest_runtime = timespec_elapsed(tstart); + if (guest_runtime->tv_sec >= maxtime) + break; + + tdata->loop(data, sync); + + (*nloops)++; + } + + make_guest_exit(sync); + wait_guest_exit(data); + +exit_free: + free_vm(data); + + return ret; +} + +static const struct test_data tests[] = { + { + .name = "map", + .mem_size = MEM_SIZE_MAP_PAGES, + .guest_code = guest_code_test_memslot_map, + .loop = test_memslot_map_loop, + }, + { + .name = "unmap", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop, + }, + { + .name = "unmap chunked", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop_chunked, + }, + { + .name = "move active area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_active, + .loop = test_memslot_move_loop, + }, + { + .name = "move inactive area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_inactive, + .loop = test_memslot_move_loop, + }, + { + .name = "RW", + .guest_code = guest_code_test_memslot_rw, + .loop = test_memslot_rw_loop + }, +}; + +#define NTESTS ARRAY_SIZE(tests) + +struct test_args { + int tfirst; + int tlast; + int nslots; + int seconds; + int runs; +}; + +static void help(char *name, struct test_args *targs) +{ + int ctr; + + pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", + name); + pr_info(" -h: print this help screen.\n"); + pr_info(" -v: enable verbose mode (not for benchmarking).\n"); + pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", + targs->nslots); + pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", + targs->tfirst, NTESTS - 1); + pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n", + targs->tlast, NTESTS - 1); + pr_info(" -l: specify the test length in seconds (currently: %i)\n", + targs->seconds); + pr_info(" -r: specify the number of runs per test (currently: %i)\n", + targs->runs); + + pr_info("\nAvailable tests:\n"); + for (ctr = 0; ctr < NTESTS; ctr++) + pr_info("%d: %s\n", ctr, tests[ctr].name); +} + +static bool parse_args(int argc, char *argv[], + struct test_args *targs) +{ + int opt; + + while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + switch (opt) { + case 'h': + default: + help(argv[0], targs); + return false; + case 'v': + verbose = true; + break; + case 'd': + map_unmap_verify = true; + break; + case 's': + targs->nslots = atoi(optarg); + if (targs->nslots <= 0 && targs->nslots != -1) { + pr_info("Slot count cap has to be positive or -1 for no cap\n"); + return false; + } + break; + case 'f': + targs->tfirst = atoi(optarg); + if (targs->tfirst < 0) { + pr_info("First test to run has to be non-negative\n"); + return false; + } + break; + case 'e': + targs->tlast = atoi(optarg); + if (targs->tlast < 0 || targs->tlast >= NTESTS) { + pr_info("Last test to run has to be non-negative and less than %zu\n", + NTESTS); + return false; + } + break; + case 'l': + targs->seconds = atoi(optarg); + if (targs->seconds < 0) { + pr_info("Test length in seconds has to be non-negative\n"); + return false; + } + break; + case 'r': + targs->runs = atoi(optarg); + if (targs->runs <= 0) { + pr_info("Runs per test has to be positive\n"); + return false; + } + break; + } + } + + if (optind < argc) { + help(argv[0], targs); + return false; + } + + if (targs->tfirst > targs->tlast) { + pr_info("First test to run cannot be greater than the last test to run\n"); + return false; + } + + return true; +} + +struct test_result { + struct timespec slot_runtime, guest_runtime, iter_runtime; + int64_t slottimens, runtimens; + uint64_t nloops; +}; + +static bool test_loop(const struct test_data *data, + const struct test_args *targs, + struct test_result *rbestslottime, + struct test_result *rbestruntime) +{ + uint64_t maxslots; + struct test_result result; + + result.nloops = 0; + if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, + &result.nloops, + &result.slot_runtime, &result.guest_runtime)) { + if (maxslots) + pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n", + maxslots); + else + pr_info("Memslot count may be too high for this test, try adjusting the cap\n"); + + return false; + } + + pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n", + result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec, + result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec); + if (!result.nloops) { + pr_info("No full loops done - too short test time or system too loaded?\n"); + return true; + } + + result.iter_runtime = timespec_div(result.guest_runtime, + result.nloops); + pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n", + result.nloops, + result.iter_runtime.tv_sec, + result.iter_runtime.tv_nsec); + result.slottimens = timespec_to_ns(result.slot_runtime); + result.runtimens = timespec_to_ns(result.iter_runtime); + + /* + * Only rank the slot setup time for tests using the whole test memory + * area so they are comparable + */ + if (!data->mem_size && + (!rbestslottime->slottimens || + result.slottimens < rbestslottime->slottimens)) + *rbestslottime = result; + if (!rbestruntime->runtimens || + result.runtimens < rbestruntime->runtimens) + *rbestruntime = result; + + return true; +} + +int main(int argc, char *argv[]) +{ + struct test_args targs = { + .tfirst = 0, + .tlast = NTESTS - 1, + .nslots = -1, + .seconds = 5, + .runs = 1, + }; + struct test_result rbestslottime; + int tctr; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv, &targs)) + return -1; + + rbestslottime.slottimens = 0; + for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { + const struct test_data *data = &tests[tctr]; + unsigned int runctr; + struct test_result rbestruntime; + + if (tctr > targs.tfirst) + pr_info("\n"); + + pr_info("Testing %s performance with %i runs, %d seconds each\n", + data->name, targs.runs, targs.seconds); + + rbestruntime.runtimens = 0; + for (runctr = 0; runctr < targs.runs; runctr++) + if (!test_loop(data, &targs, + &rbestslottime, &rbestruntime)) + break; + + if (rbestruntime.runtimens) + pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n", + rbestruntime.iter_runtime.tv_sec, + rbestruntime.iter_runtime.tv_nsec, + rbestruntime.nloops); + } + + if (rbestslottime.slottimens) + pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n", + rbestslottime.slot_runtime.tv_sec, + rbestslottime.slot_runtime.tv_nsec); + + return 0; +} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 978f5b5f4dc0..72a1c9b4882c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -132,7 +132,7 @@ static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); - virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2); /* Ditto for the host mapping so that both pages can be zeroed. */ hva = addr_gpa2hva(vm, MEM_REGION_GPA); @@ -376,8 +376,9 @@ static void test_add_max_memory_regions(void) pr_info("Adding slots 0..%i, each memory region with %dK size\n", (max_mem_slots - 1), MEM_REGION_SIZE >> 10); - mem = mmap(NULL, MEM_REGION_SIZE * max_mem_slots + alignment, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); @@ -401,7 +402,7 @@ static void test_add_max_memory_regions(void) TEST_ASSERT(ret == -1 && errno == EINVAL, "Adding one more memory slot should fail with EINVAL"); - munmap(mem, MEM_REGION_SIZE * max_mem_slots + alignment); + munmap(mem, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment); munmap(mem_extra, MEM_REGION_SIZE); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index fcc840088c91..ecec30865a74 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -73,8 +73,6 @@ static void steal_time_init(struct kvm_vm *vm) for (i = 0; i < NR_VCPUS; ++i) { int ret; - vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid()); - /* ST_GPA_BASE is identity mapped */ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vm, st_gva[i]); @@ -295,7 +293,7 @@ int main(int ac, char **av) vm = vm_create_default(0, 0, guest_code); gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); ucall_init(vm, NULL); /* Add the rest of the VCPUs */ @@ -322,7 +320,7 @@ int main(int ac, char **av) run_delay = get_run_delay(); pthread_create(&thread, &attr, do_steal_time, NULL); do - pthread_yield(); + sched_yield(); while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS); pthread_join(thread, NULL); run_delay = get_run_delay() - run_delay; diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c new file mode 100644 index 000000000000..f070ff0224fa --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM_CAP_EXIT_ON_EMULATION_FAILURE capability. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define VCPU_ID 1 +#define PAGE_SIZE 4096 +#define MAXPHYADDR 36 + +#define MEM_REGION_GVA 0x0000123456789000 +#define MEM_REGION_GPA 0x0000000700000000 +#define MEM_REGION_SLOT 10 +#define MEM_REGION_SIZE PAGE_SIZE + +static void guest_code(void) +{ + __asm__ __volatile__("flds (%[addr])" + :: [addr]"r"(MEM_REGION_GVA)); + + GUEST_DONE(); +} + +static void run_guest(struct kvm_vm *vm) +{ + int rc; + + rc = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); +} + +/* + * Accessors to get R/M, REG, and Mod bits described in the SDM vol 2, + * figure 2-2 "Table Interpretation of ModR/M Byte (C8H)". + */ +#define GET_RM(insn_byte) (insn_byte & 0x7) +#define GET_REG(insn_byte) ((insn_byte & 0x38) >> 3) +#define GET_MOD(insn_byte) ((insn_byte & 0xc) >> 6) + +/* Ensure we are dealing with a simple 2-byte flds instruction. */ +static bool is_flds(uint8_t *insn_bytes, uint8_t insn_size) +{ + return insn_size >= 2 && + insn_bytes[0] == 0xd9 && + GET_REG(insn_bytes[1]) == 0x0 && + GET_MOD(insn_bytes[1]) == 0x0 && + /* Ensure there is no SIB byte. */ + GET_RM(insn_bytes[1]) != 0x4 && + /* Ensure there is no displacement byte. */ + GET_RM(insn_bytes[1]) != 0x5; +} + +static void process_exit_on_emulation_error(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_regs regs; + uint8_t *insn_bytes; + uint8_t insn_size; + uint64_t flags; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Unexpected suberror: %u", + run->emulation_failure.suberror); + + if (run->emulation_failure.ndata >= 1) { + flags = run->emulation_failure.flags; + if ((flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES) && + run->emulation_failure.ndata >= 3) { + insn_size = run->emulation_failure.insn_size; + insn_bytes = run->emulation_failure.insn_bytes; + + TEST_ASSERT(insn_size <= 15 && insn_size > 0, + "Unexpected instruction size: %u", + insn_size); + + TEST_ASSERT(is_flds(insn_bytes, insn_size), + "Unexpected instruction. Expected 'flds' (0xd9 /0)"); + + /* + * If is_flds() succeeded then the instruction bytes + * contained an flds instruction that is 2-bytes in + * length (ie: no prefix, no SIB, no displacement). + */ + vcpu_regs_get(vm, VCPU_ID, ®s); + regs.rip += 2; + vcpu_regs_set(vm, VCPU_ID, ®s); + } + } +} + +static void do_guest_assert(struct kvm_vm *vm, struct ucall *uc) +{ + TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], __FILE__, + uc->args[1]); +} + +static void check_for_guest_assert(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + if (run->exit_reason == KVM_EXIT_IO && + get_ucall(vm, VCPU_ID, &uc) == UCALL_ABORT) { + do_guest_assert(vm, &uc); + } +} + +static void process_ucall_done(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + check_for_guest_assert(vm); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + TEST_ASSERT(get_ucall(vm, VCPU_ID, &uc) == UCALL_DONE, + "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", + uc.cmd, UCALL_DONE); +} + +static uint64_t process_ucall(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + do_guest_assert(vm, &uc); + break; + case UCALL_DONE: + process_ucall_done(vm); + break; + default: + TEST_ASSERT(false, "Unexpected ucall"); + } + + return uc.cmd; +} + +int main(int argc, char *argv[]) +{ + struct kvm_enable_cap emul_failure_cap = { + .cap = KVM_CAP_EXIT_ON_EMULATION_FAILURE, + .args[0] = 1, + }; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + struct kvm_vm *vm; + uint64_t gpa, pte; + uint64_t *hva; + int rc; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + if (!kvm_check_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { + printf("module parameter 'allow_smaller_maxphyaddr' is not set. Skipping test.\n"); + return 0; + } + + cpuid = kvm_get_supported_cpuid(); + + entry = kvm_get_supported_cpuid_index(0x80000008, 0); + entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; + set_cpuid(cpuid, entry); + + vcpu_set_cpuid(vm, VCPU_ID, cpuid); + + rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); + TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); + vm_enable_cap(vm, &emul_failure_cap); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / PAGE_SIZE, 0); + gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, + MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, PAGE_SIZE); + pte = vm_get_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA); + vm_set_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA, pte | (1ull << 36)); + + run_guest(vm); + process_exit_on_emulation_error(vm); + run_guest(vm); + + TEST_ASSERT(process_ucall(vm) == UCALL_DONE, "Expected UCALL_DONE"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 63096cea26c6..2b46dcca86a8 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -22,15 +22,6 @@ static int ud_count; -void enable_x2apic(void) -{ - uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4); - - wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | - MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); - wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED); -} - static void guest_ud_handler(struct ex_regs *regs) { ud_count++; @@ -59,7 +50,7 @@ void guest_code(struct vmx_pages *vmx_pages) #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - enable_x2apic(); + x2apic_enable(); GUEST_SYNC(1); GUEST_SYNC(2); @@ -121,14 +112,38 @@ void inject_nmi(struct kvm_vm *vm) vcpu_events_set(vm, VCPU_ID, &events); } +static void save_restore_vm(struct kvm_vm *vm) +{ + struct kvm_regs regs1, regs2; + struct kvm_x86_state *state; + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, VCPU_ID); + vcpu_enable_evmcs(vm, VCPU_ID); + vcpu_load_state(vm, VCPU_ID, state); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); +} + int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0; - struct kvm_regs regs1, regs2; struct kvm_vm *vm; struct kvm_run *run; - struct kvm_x86_state *state; struct ucall uc; int stage; @@ -145,21 +160,18 @@ int main(int argc, char *argv[]) vcpu_set_hv_cpuid(vm, VCPU_ID); vcpu_enable_evmcs(vm, VCPU_ID); - run = vcpu_state(vm, VCPU_ID); - - vcpu_regs_get(vm, VCPU_ID, ®s1); - vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); - vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); pr_info("Running L1 which uses EVMCS to run L2\n"); for (stage = 1;; stage++) { + run = vcpu_state(vm, VCPU_ID); _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", @@ -184,32 +196,23 @@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - state = vcpu_save_state(vm, VCPU_ID); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_hv_cpuid(vm, VCPU_ID); - vcpu_enable_evmcs(vm, VCPU_ID); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); + save_restore_vm(vm); /* Force immediate L2->L1 exit before resuming */ if (stage == 8) { pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); inject_nmi(vm); } + + /* + * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly + * restored VM (before the first KVM_RUN) to check that + * KVM_STATE_NESTED_EVMCS is not lost. + */ + if (stage == 9) { + pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); + save_restore_vm(vm); + } } done: diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c index 9b78e8889638..a711f83749ea 100644 --- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c @@ -19,7 +19,12 @@ struct { u32 function; u32 index; } mangled_cpuids[] = { + /* + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, + * which are not controlled for by this test. + */ {.function = 0xd, .index = 0}, + {.function = 0xd, .index = 1}, }; static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) @@ -140,8 +145,7 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) { int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); - vm_vaddr_t gva = vm_vaddr_alloc(vm, size, - getpagesize(), 0, 0); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); memcpy(guest_cpuids, cpuid, size); diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index cb953df4d7d0..8aed0db1331d 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -37,9 +37,7 @@ static void test_get_msr_index(void) int old_res, res, kvm_fd, r; struct kvm_msr_list *list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_index_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); @@ -101,9 +99,7 @@ static void test_get_msr_feature(void) int res, old_res, i, kvm_fd; struct kvm_msr_list *feature_list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_feature_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 7f1d2765572c..e0b2bb1339b1 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -7,6 +7,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "hyperv.h" struct ms_hyperv_tsc_page { volatile u32 tsc_sequence; @@ -15,13 +16,6 @@ struct ms_hyperv_tsc_page { volatile s64 tsc_offset; } __packed; -#define HV_X64_MSR_GUEST_OS_ID 0x40000000 -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 -#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 - /* Simplified mul_u64_u64_shr() */ static inline u64 mul_u64_u64_shr64(u64 a, u64 b) { @@ -220,8 +214,8 @@ int main(void) vcpu_set_hv_cpuid(vm, VCPU_ID); - tsc_page_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); - memset(addr_gpa2hva(vm, tsc_page_gva), 0x0, getpagesize()); + tsc_page_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, "TSC page has to be page aligned\n"); vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c new file mode 100644 index 000000000000..91d88aaa9899 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -0,0 +1,684 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. + * + * Tests for Hyper-V features enablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +#define VCPU_ID 0 +#define LINUX_OS_ID ((u64)0x8100 << 48) + +extern unsigned char rdmsr_start; +extern unsigned char rdmsr_end; + +static u64 do_rdmsr(u32 idx) +{ + u32 lo, hi; + + asm volatile("rdmsr_start: rdmsr;" + "rdmsr_end:" + : "=a"(lo), "=c"(hi) + : "c"(idx)); + + return (((u64) hi) << 32) | lo; +} + +extern unsigned char wrmsr_start; +extern unsigned char wrmsr_end; + +static void do_wrmsr(u32 idx, u64 val) +{ + u32 lo, hi; + + lo = val; + hi = val >> 32; + + asm volatile("wrmsr_start: wrmsr;" + "wrmsr_end:" + : : "a"(lo), "c"(idx), "d"(hi)); +} + +static int nr_gp; +static int nr_ud; + +static inline u64 hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address) +{ + u64 hv_status; + + asm volatile("mov %3, %%r8\n" + "vmcall" + : "=a" (hv_status), + "+c" (control), "+d" (input_address) + : "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + + return hv_status; +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + unsigned char *rip = (unsigned char *)regs->rip; + bool r, w; + + r = rip == &rdmsr_start; + w = rip == &wrmsr_start; + GUEST_ASSERT(r || w); + + nr_gp++; + + if (r) + regs->rip = (uint64_t)&rdmsr_end; + else + regs->rip = (uint64_t)&wrmsr_end; +} + +static void guest_ud_handler(struct ex_regs *regs) +{ + nr_ud++; + regs->rip += 3; +} + +struct msr_data { + uint32_t idx; + bool available; + bool write; + u64 write_val; +}; + +struct hcall_data { + uint64_t control; + uint64_t expect; + bool ud_expected; +}; + +static void guest_msr(struct msr_data *msr) +{ + int i = 0; + + while (msr->idx) { + WRITE_ONCE(nr_gp, 0); + if (!msr->write) + do_rdmsr(msr->idx); + else + do_wrmsr(msr->idx, msr->write_val); + + if (msr->available) + GUEST_ASSERT(READ_ONCE(nr_gp) == 0); + else + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + + GUEST_SYNC(i++); + } + + GUEST_DONE(); +} + +static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) +{ + int i = 0; + u64 res, input, output; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + + while (hcall->control) { + nr_ud = 0; + if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { + input = pgs_gpa; + output = pgs_gpa + 4096; + } else { + input = output = 0; + } + + res = hypercall(hcall->control, input, output); + if (hcall->ud_expected) + GUEST_ASSERT(nr_ud == 1); + else + GUEST_ASSERT(res == hcall->expect); + + GUEST_SYNC(i++); + } + + GUEST_DONE(); +} + +static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *feat, + struct kvm_cpuid_entry2 *recomm, + struct kvm_cpuid_entry2 *dbg) +{ + TEST_ASSERT(set_cpuid(cpuid, feat), + "failed to set KVM_CPUID_FEATURES leaf"); + TEST_ASSERT(set_cpuid(cpuid, recomm), + "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); + TEST_ASSERT(set_cpuid(cpuid, dbg), + "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); + vcpu_set_cpuid(vm, VCPU_ID, cpuid); +} + +static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, + struct kvm_cpuid2 *best) +{ + struct kvm_run *run; + struct ucall uc; + int stage = 0, r; + struct kvm_cpuid_entry2 feat = { + .function = HYPERV_CPUID_FEATURES + }; + struct kvm_cpuid_entry2 recomm = { + .function = HYPERV_CPUID_ENLIGHTMENT_INFO + }; + struct kvm_cpuid_entry2 dbg = { + .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES + }; + struct kvm_enable_cap cap = {0}; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + switch (stage) { + case 0: + /* + * Only available when Hyper-V identification is set + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 0; + msr->available = 0; + break; + case 1: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = 0; + msr->available = 0; + break; + case 2: + feat.eax |= HV_MSR_HYPERCALL_AVAILABLE; + /* + * HV_X64_MSR_GUEST_OS_ID has to be written first to make + * HV_X64_MSR_HYPERCALL available. + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 1; + msr->write_val = LINUX_OS_ID; + msr->available = 1; + break; + case 3: + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = 0; + msr->available = 1; + break; + case 4: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = 0; + msr->available = 1; + break; + + case 5: + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = 0; + msr->available = 0; + break; + case 6: + feat.eax |= HV_MSR_VP_RUNTIME_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 7: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 8: + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = 0; + msr->available = 0; + break; + case 9: + feat.eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 10: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 11: + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = 0; + msr->available = 0; + break; + case 12: + feat.eax |= HV_MSR_VP_INDEX_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 13: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 14: + msr->idx = HV_X64_MSR_RESET; + msr->write = 0; + msr->available = 0; + break; + case 15: + feat.eax |= HV_MSR_RESET_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 16: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 17: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = 0; + msr->available = 0; + break; + case 18: + feat.eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 19: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 20: + msr->idx = HV_X64_MSR_EOM; + msr->write = 0; + msr->available = 0; + break; + case 21: + /* + * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 + * capability enabled and guest visible CPUID bit unset. + */ + cap.cap = KVM_CAP_HYPERV_SYNIC2; + vcpu_enable_cap(vm, VCPU_ID, &cap); + break; + case 22: + feat.eax |= HV_MSR_SYNIC_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 23: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 24: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = 0; + msr->available = 0; + break; + case 25: + feat.eax |= HV_MSR_SYNTIMER_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 26: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + case 27: + /* Direct mode test */ + msr->write = 1; + msr->write_val = 1 << 12; + msr->available = 0; + break; + case 28: + feat.edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + msr->available = 1; + break; + + case 29: + msr->idx = HV_X64_MSR_EOI; + msr->write = 0; + msr->available = 0; + break; + case 30: + feat.eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + + case 31: + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = 0; + msr->available = 0; + break; + case 32: + feat.eax |= HV_ACCESS_FREQUENCY_MSRS; + msr->write = 0; + msr->available = 1; + break; + case 33: + /* Read only */ + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 34: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = 0; + msr->available = 0; + break; + case 35: + feat.eax |= HV_ACCESS_REENLIGHTENMENT; + msr->write = 0; + msr->available = 1; + break; + case 36: + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + case 37: + /* Can only write '0' */ + msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS; + msr->write = 1; + msr->write_val = 1; + msr->available = 0; + break; + + case 38: + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = 0; + msr->available = 0; + break; + case 39: + feat.edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + msr->write = 0; + msr->available = 1; + break; + case 40: + msr->write = 1; + msr->write_val = 1; + msr->available = 1; + break; + + case 41: + msr->idx = HV_X64_MSR_SYNDBG_STATUS; + msr->write = 0; + msr->available = 0; + break; + case 42: + feat.edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + msr->write = 0; + msr->available = 1; + break; + case 43: + msr->write = 1; + msr->write_val = 0; + msr->available = 1; + break; + + case 44: + /* END */ + msr->idx = 0; + break; + } + + hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + + if (msr->idx) + pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, + msr->idx, msr->write ? "write" : "read"); + else + pr_debug("Stage %d: finish\n", stage); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + return; + case UCALL_DONE: + return; + } + + stage++; + } +} + +static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall, + void *input, void *output, struct kvm_cpuid2 *best) +{ + struct kvm_run *run; + struct ucall uc; + int stage = 0, r; + struct kvm_cpuid_entry2 feat = { + .function = HYPERV_CPUID_FEATURES, + .eax = HV_MSR_HYPERCALL_AVAILABLE + }; + struct kvm_cpuid_entry2 recomm = { + .function = HYPERV_CPUID_ENLIGHTMENT_INFO + }; + struct kvm_cpuid_entry2 dbg = { + .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES + }; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + switch (stage) { + case 0: + hcall->control = 0xdeadbeef; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + + case 1: + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 2: + feat.ebx |= HV_POST_MESSAGES; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 3: + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 4: + feat.ebx |= HV_SIGNAL_EVENTS; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 5: + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + case 6: + dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 7: + feat.ebx |= HV_DEBUGGING; + hcall->expect = HV_STATUS_OPERATION_DENIED; + break; + + case 8: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 9: + recomm.eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 10: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 11: + recomm.eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 12: + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 13: + recomm.eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + case 14: + /* Nothing in 'sparse banks' -> success */ + hcall->control = HVCALL_SEND_IPI_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 15: + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 16: + recomm.ebx = 0xfff; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 17: + /* XMM fast hypercall */ + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = true; + break; + case 18: + feat.edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + hcall->ud_expected = false; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 19: + /* END */ + hcall->control = 0; + break; + } + + hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + + if (hcall->control) + pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, + hcall->control); + else + pr_debug("Stage %d: finish\n", stage); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + return; + case UCALL_DONE: + return; + } + + stage++; + } +} + +int main(void) +{ + struct kvm_cpuid2 *best; + struct kvm_vm *vm; + vm_vaddr_t msr_gva, hcall_page, hcall_params; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, + .args = {1} + }; + + /* Test MSRs */ + vm = vm_create_default(VCPU_ID, 0, guest_msr); + + msr_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); + vcpu_args_set(vm, VCPU_ID, 1, msr_gva); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + pr_info("Testing access to Hyper-V specific MSRs\n"); + guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva), + best); + kvm_vm_free(vm); + + /* Test hypercalls */ + vm = vm_create_default(VCPU_ID, 0, guest_hcall); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + hcall_params = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + + vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + pr_info("Testing access to Hyper-V hypercalls\n"); + guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params), + addr_gva2hva(vm, hcall_page), + addr_gva2hva(vm, hcall_page) + getpagesize(), + best); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 732b244d6956..04ed975662c9 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -227,7 +227,7 @@ int main(void) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); enter_guest(vm); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c new file mode 100644 index 000000000000..da2325fcad87 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 1 + +#define MMIO_GPA 0x100000000ull + +static void guest_code(void) +{ + (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); + (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); + + GUEST_ASSERT(0); +} + +static void guest_pf_handler(struct ex_regs *regs) +{ + /* PFEC == RSVD | PRESENT (read, kernel). */ + GUEST_ASSERT(regs->error_code == 0x9); + GUEST_DONE(); +} + +static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) +{ + u32 good_cpuid_val = *cpuid_reg; + struct kvm_run *run; + struct kvm_vm *vm; + uint64_t cmd; + int r; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + run = vcpu_state(vm, VCPU_ID); + + /* Map 1gb page without a backing memlot. */ + __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, X86_PAGE_SIZE_1G); + + r = _vcpu_run(vm, VCPU_ID); + + /* Guest access to the 1gb page should trigger MMIO. */ + TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_MMIO, + "Unexpected exit reason: %u (%s), expected MMIO exit (1gb page w/o memslot)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->mmio.len == 8, "Unexpected exit mmio size = %u", run->mmio.len); + + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA, + "Unexpected exit mmio address = 0x%llx", run->mmio.phys_addr); + + /* + * Effect the CPUID change for the guest and re-enter the guest. Its + * access should now #PF due to the PAGE_SIZE bit being reserved or + * the resulting GPA being invalid. Note, kvm_get_supported_cpuid() + * returns the struct that contains the entry being modified. Eww. + */ + *cpuid_reg = evil_cpuid_val; + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* + * Add a dummy memslot to coerce KVM into bumping the MMIO generation. + * KVM does not "officially" support mucking with CPUID after KVM_RUN, + * and will incorrectly reuse MMIO SPTEs. Don't delete the memslot! + * KVM x86 zaps all shadow pages on memslot deletion. + */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MMIO_GPA << 1, 10, 1, 0); + + /* Set up a #PF handler to eat the RSVD #PF and signal all done! */ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler); + + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); + + cmd = get_ucall(vm, VCPU_ID, NULL); + TEST_ASSERT(cmd == UCALL_DONE, + "Unexpected guest exit, exit_reason=%s, ucall.cmd = %lu\n", + exit_reason_str(run->exit_reason), cmd); + + /* + * Restore the happy CPUID value for the next test. Yes, changes are + * indeed persistent across VM destruction. + */ + *cpuid_reg = good_cpuid_val; + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *entry; + int opt; + + /* + * All tests are opt-in because TDP doesn't play nice with reserved #PF + * in the GVA->GPA translation. The hardware page walker doesn't let + * software change GBPAGES or MAXPHYADDR, and KVM doesn't manually walk + * the GVA on fault for performance reasons. + */ + bool do_gbpages = false; + bool do_maxphyaddr = false; + + setbuf(stdout, NULL); + + while ((opt = getopt(argc, argv, "gm")) != -1) { + switch (opt) { + case 'g': + do_gbpages = true; + break; + case 'm': + do_maxphyaddr = true; + break; + case 'h': + default: + printf("usage: %s [-g (GBPAGES)] [-m (MAXPHYADDR)]\n", argv[0]); + break; + } + } + + if (!do_gbpages && !do_maxphyaddr) { + print_skip("No sub-tests selected"); + return 0; + } + + entry = kvm_get_supported_cpuid_entry(0x80000001); + if (!(entry->edx & CPUID_GBPAGES)) { + print_skip("1gb hugepages not supported"); + return 0; + } + + if (do_gbpages) { + pr_info("Test MMIO after toggling CPUID.GBPAGES\n\n"); + mmu_role_test(&entry->edx, entry->edx & ~CPUID_GBPAGES); + } + + if (do_maxphyaddr) { + pr_info("Test MMIO after changing CPUID.MAXPHYADDR\n\n"); + entry = kvm_get_supported_cpuid_entry(0x80000008); + mmu_role_test(&entry->eax, (entry->eax & ~0xff) | 0x20); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 12c558fc8074..ae76436af0cc 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -14,16 +14,12 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "apic.h" #define N_VCPU 2 #define VCPU_ID0 0 #define VCPU_ID1 1 -static uint32_t get_bsp_flag(void) -{ - return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; -} - static void guest_bsp_vcpu(void *arg) { GUEST_SYNC(1); @@ -94,7 +90,7 @@ static struct kvm_vm *create_vm(void) pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); return vm; @@ -106,8 +102,6 @@ static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); else vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); - - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); } static void run_vm_bsp(uint32_t bsp_vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 613c42c5a9b8..d0fe2fdce58c 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -53,15 +53,28 @@ static inline void sync_with_host(uint64_t phase) : "+a" (phase)); } -void self_smi(void) +static void self_smi(void) { - wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4), - APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); } -void guest_code(void *arg) +static void l2_guest_code(void) { + sync_with_host(8); + + sync_with_host(10); + + vmcall(); +} + +static void guest_code(void *arg) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); + struct svm_test_data *svm = arg; + struct vmx_pages *vmx_pages = arg; sync_with_host(1); @@ -74,21 +87,50 @@ void guest_code(void *arg) sync_with_host(4); if (arg) { - if (cpu_has_svm()) - generic_svm_setup(arg, NULL, NULL); - else - GUEST_ASSERT(prepare_for_vmx_operation(arg)); + if (cpu_has_svm()) { + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } else { + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } sync_with_host(5); self_smi(); sync_with_host(7); + + if (cpu_has_svm()) { + run_guest(svm->vmcb, svm->vmcb_gpa); + svm->vmcb->save.rip += 3; + run_guest(svm->vmcb, svm->vmcb_gpa); + } else { + vmlaunch(); + vmresume(); + } + + /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */ + sync_with_host(12); } sync_with_host(DONE); } +void inject_smi(struct kvm_vm *vm) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vm, VCPU_ID, &events); + + events.smi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_SMM; + + vcpu_events_set(vm, VCPU_ID, &events); +} + int main(int argc, char *argv[]) { vm_vaddr_t nested_gva = 0; @@ -147,6 +189,22 @@ int main(int argc, char *argv[]) "Unexpected stage: #%x, got %x", stage, stage_reported); + /* + * Enter SMM during L2 execution and check that we correctly + * return from it. Do not perform save/restore while in SMM yet. + */ + if (stage == 8) { + inject_smi(vm); + continue; + } + + /* + * Perform save/restore while the guest is in SMM triggered + * during L2 execution. + */ + if (stage == 10) + inject_smi(vm); + state = vcpu_save_state(vm, VCPU_ID); kvm_vm_release(vm); kvm_vm_restart(vm, O_RDWR); diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index d672f0a473f8..fc03a150278d 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -24,6 +24,10 @@ #define UCALL_PIO_PORT ((uint16_t)0x1000) +struct ucall uc_none = { + .cmd = UCALL_NONE, +}; + /* * ucall is embedded here to protect against compiler reshuffling registers * before calling a function. In this test we only need to get KVM_EXIT_IO @@ -34,7 +38,8 @@ void guest_code(void) asm volatile("1: in %[port], %%al\n" "add $0x1, %%rbx\n" "jmp 1b" - : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx"); + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none) + : "rax", "rbx"); } static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index e357d8e222d4..5a6a662f2e59 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -18,15 +18,6 @@ #define rounded_rdmsr(x) ROUND(rdmsr(x)) #define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x)) -#define GUEST_ASSERT_EQ(a, b) do { \ - __typeof(a) _a = (a); \ - __typeof(b) _b = (b); \ - if (_a != _b) \ - ucall(UCALL_ABORT, 4, \ - "Failed guest assert: " \ - #a " == " #b, __LINE__, _a, _b); \ - } while(0) - static void guest_code(void) { u64 val = 0; diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index 72c0d0797522..e3e20e8848d0 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -574,7 +574,7 @@ static void test_msr_filter_allow(void) { vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); @@ -588,12 +588,12 @@ static void test_msr_filter_allow(void) { run_guest_then_process_wrmsr(vm, MSR_NON_EXISTENT); run_guest_then_process_rdmsr(vm, MSR_NON_EXISTENT); - vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); run_guest(vm); - vm_handle_exception(vm, UD_VECTOR, NULL); + vm_install_exception_handler(vm, UD_VECTOR, NULL); if (process_ucall(vm) != UCALL_DONE) { - vm_handle_exception(vm, GP_VECTOR, guest_fep_gp_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); /* Process emulated rdmsr and wrmsr instructions. */ run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index d14888b34adb..d438c4d3228a 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) } vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - prepare_virtualize_apic_accesses(vmx, vm, 0); + prepare_virtualize_apic_accesses(vmx, vm); vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); while (!done) { diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 537de1068554..06a64980a5d2 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) * Add an identity map for GVA range [0xc0000000, 0xc0002000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0); + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); /* * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to @@ -107,9 +107,9 @@ int main(int argc, char *argv[]) * meaning after the last call to virt_map. */ prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); bmap = bitmap_alloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c new file mode 100644 index 000000000000..280c01fd2412 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include <time.h> + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + + +#define VCPU_ID 0 + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. + */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. + */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void tsc_scaling_check_supported(void) +{ + if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { + print_skip("TSC scaling not supported by the HW"); + exit(KSFT_SKIP); + } +} + +static void stable_tsc_check_supported(void) +{ + FILE *fp; + char buf[4]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp == NULL) + goto skip_test; + + if (fgets(buf, sizeof(buf), fp) == NULL) + goto skip_test; + + if (strncmp(buf, "tsc", sizeof(buf))) + goto skip_test; + + return; +skip_test: + print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable"); + exit(KSFT_SKIP); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + nested_vmx_check_supported(); + tsc_scaling_check_supported(); + stable_tsc_check_supported(); + + /* + * We set L1's scale factor to be a random number from 2 to 10. + * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. + */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, + (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *) uc.args[0]); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 2f964cdc273c..afbbc40df884 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -42,8 +42,6 @@ #define HALTER_VCPU_ID 0 #define SENDER_VCPU_ID 1 -volatile uint32_t *apic_base = (volatile uint32_t *)APIC_DEFAULT_GPA; - /* * Vector for IPI from sender vCPU to halting vCPU. * Value is arbitrary and was chosen for the alternating bit pattern. Any @@ -86,45 +84,6 @@ struct thread_params { uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ }; -uint32_t read_apic_reg(uint reg) -{ - return apic_base[reg >> 2]; -} - -void write_apic_reg(uint reg, uint32_t val) -{ - apic_base[reg >> 2] = val; -} - -void disable_apic(void) -{ - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) & - ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); -} - -void enable_xapic(void) -{ - uint64_t val = rdmsr(MSR_IA32_APICBASE); - - /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ - if (val & MSR_IA32_APICBASE_EXTD) { - disable_apic(); - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); - } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { - wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); - } - - /* - * Per SDM: reset value of spurious interrupt vector register has the - * APIC software enabled bit=0. It must be enabled in addition to the - * enable bit in the MSR. - */ - val = read_apic_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; - write_apic_reg(APIC_SPIV, val); -} - void verify_apic_base_addr(void) { uint64_t msr = rdmsr(MSR_IA32_APICBASE); @@ -136,10 +95,10 @@ void verify_apic_base_addr(void) static void halter_guest_code(struct test_data_page *data) { verify_apic_base_addr(); - enable_xapic(); + xapic_enable(); - data->halter_apic_id = GET_APIC_ID_FIELD(read_apic_reg(APIC_ID)); - data->halter_lvr = read_apic_reg(APIC_LVR); + data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + data->halter_lvr = xapic_read_reg(APIC_LVR); /* * Loop forever HLTing and recording halts & wakes. Disable interrupts @@ -150,8 +109,8 @@ static void halter_guest_code(struct test_data_page *data) * TPR and PPR for diagnostic purposes in case the test fails. */ for (;;) { - data->halter_tpr = read_apic_reg(APIC_TASKPRI); - data->halter_ppr = read_apic_reg(APIC_PROCPRI); + data->halter_tpr = xapic_read_reg(APIC_TASKPRI); + data->halter_ppr = xapic_read_reg(APIC_PROCPRI); data->hlt_count++; asm volatile("sti; hlt; cli"); data->wake_count++; @@ -166,7 +125,7 @@ static void halter_guest_code(struct test_data_page *data) static void guest_ipi_handler(struct ex_regs *regs) { ipis_rcvd++; - write_apic_reg(APIC_EOI, 77); + xapic_write_reg(APIC_EOI, 77); } static void sender_guest_code(struct test_data_page *data) @@ -179,7 +138,7 @@ static void sender_guest_code(struct test_data_page *data) uint64_t tsc_start; verify_apic_base_addr(); - enable_xapic(); + xapic_enable(); /* * Init interrupt command register for sending IPIs @@ -206,8 +165,8 @@ static void sender_guest_code(struct test_data_page *data) * First IPI can be sent unconditionally because halter vCPU * starts earlier. */ - write_apic_reg(APIC_ICR2, icr2_val); - write_apic_reg(APIC_ICR, icr_val); + xapic_write_reg(APIC_ICR2, icr2_val); + xapic_write_reg(APIC_ICR, icr_val); data->ipis_sent++; /* @@ -462,13 +421,13 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID); - vm_handle_exception(vm, IPI_VECTOR, guest_ipi_handler); + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA, 0); + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code); - test_data_page_vaddr = vm_vaddr_alloc(vm, 0x1000, 0x1000, 0, 0); + test_data_page_vaddr = vm_vaddr_alloc_page(vm); data = (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr); memset(data, 0, sizeof(*data)); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 1f4a0599683c..117bf49a3d79 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) /* Map a region for the shared_info page */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); - virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2); struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index 8389e0bfd711..adc94452b57c 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -103,7 +103,7 @@ int main(int argc, char *argv[]) /* Map a region for the hypercall pages */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); - virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2, 0); + virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); |