Diffstat (limited to 'arch/x86')
50 files changed, 1162 insertions, 433 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7bb15747fea2..0a7b885964ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -91,7 +91,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if X86_64 + select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@ -105,6 +105,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FP_TEST @@ -130,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 265901a84f3f..5fa6ee2c2dde 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -17,7 +17,6 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 0c8d7963483c..d28bdabcc87e 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -16,7 +16,6 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S index 8e1b47792b31..c9dae1cd2919 100644 --- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S @@ -296,7 +296,11 @@ W14 = TMP_ # ENTRY(sha1_x8_avx2) - push RSP_SAVE + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 #save rsp mov %rsp, RSP_SAVE @@ -446,7 +450,12 @@ lloop: ## Postamble mov RSP_SAVE, %rsp - pop RSP_SAVE + + # restore callee-saved clobbered registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret ENDPROC(sha1_x8_avx2) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b3cf81333a54..ab220ac9b3b9 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -163,7 +163,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) addr = 0; } - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; addr = get_unmapped_area(NULL, addr, image->size - image->sym_vvar_start, 0, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 73a75aa5a66d..33787ee817f0 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name) static int backtrace_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; return perf_callchain_store(entry, addr); } @@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os 
callchain now */ @@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment) #include <asm/compat.h> static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; @@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #else static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; @@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 0a5ede187d9c..eb0533558c2b 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -826,7 +826,7 @@ static int p4_hw_config(struct perf_event *event) * Clear bits we reserve to be managed by kernel itself * and never allowed from a user space */ - event->attr.config &= P4_CONFIG_MASK; + event->attr.config &= P4_CONFIG_MASK; rc = p4_validate_raw_event(event); if (rc) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 16c178916412..fce74062d981 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -891,7 +891,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return -ENODEV; pkg = topology_phys_to_logical_pkg(phys_id); - if (WARN_ON_ONCE(pkg < 0)) + if (pkg < 0) return -EINVAL; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index ae6aad1d24f7..f5e737ff0022 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -116,13 +116,13 @@ static struct linux_binfmt aout_format = { .min_coredump = PAGE_SIZE }; -static void set_brk(unsigned long start, unsigned long end) +static unsigned long set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); if (end <= start) - return; - vm_brk(start, end - start); + return start; + return vm_brk(start, end - start); } #ifdef CONFIG_COREDUMP @@ -349,7 +349,10 @@ static int load_aout_binary(struct linux_binprm *bprm) #endif if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { - vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + if (IS_ERR_VALUE(error)) + return error; + read_code(bprm->file, N_TXTADDR(ex), fd_offset, ex.a_text+ex.a_data); goto beyond_if; @@ -372,10 +375,13 @@ static int load_aout_binary(struct linux_binprm *bprm) if (error != N_DATADDR(ex)) return 
error; } + beyond_if: - set_binfmt(&aout_format); + error = set_brk(current->mm->start_brk, current->mm->brk); + if (IS_ERR_VALUE(error)) + return error; - set_brk(current->mm->start_brk, current->mm->brk); + set_binfmt(&aout_format); current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); @@ -434,7 +440,9 @@ static int load_aout_library(struct file *file) error_time = jiffies; } #endif - vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + if (IS_ERR_VALUE(retval)) + goto out; read_code(file, start_addr, N_TXTOFF(ex), ex.a_text + ex.a_data); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7e394485a5f..e0fbe7e70dc1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -562,7 +562,6 @@ struct kvm_vcpu_arch { struct { u64 msr_val; u64 last_steal; - u64 accum_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; } st; @@ -774,6 +773,11 @@ struct kvm_arch { u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; + + /* Struct members for AVIC */ + u32 ldr_mode; + struct page *avic_logical_id_table_page; + struct page *avic_physical_id_table_page; }; struct kvm_vm_stat { @@ -804,6 +808,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -848,6 +853,9 @@ struct kvm_x86_ops { bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); + int (*vm_init)(struct kvm *kvm); + void (*vm_destroy)(struct kvm *kvm); + /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); @@ -914,7 +922,7 @@ struct kvm_x86_ops { bool (*get_enable_apicv)(void); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm *kvm, int isr); + void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); @@ -990,8 +998,13 @@ struct kvm_x86_ops { */ int (*pre_block)(struct kvm_vcpu *vcpu); void (*post_block)(struct kvm_vcpu *vcpu); + + void (*vcpu_blocking)(struct kvm_vcpu *vcpu); + void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1341,7 +1354,18 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_blocking) + kvm_x86_ops->vcpu_blocking(vcpu); +} + +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_unblocking) + kvm_x86_ops->vcpu_unblocking(vcpu); +} + +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 
7e68f9558552..a7f9181f63f3 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -32,8 +32,6 @@ static inline int klp_check_compiler_support(void) #endif return 0; } -int klp_write_module_reloc(struct module *mod, unsigned long type, - unsigned long loc, unsigned long value); static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f86491a7bc9d..1a27396b6ea0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6136d99f537b..d0fe23ec7e98 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -78,7 +78,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 exit_int_info; u32 exit_int_info_err; u64 nested_ctl; - u8 reserved_4[16]; + u64 avic_vapic_bar; + u8 reserved_4[8]; u32 event_inj; u32 event_inj_err; u64 nested_cr3; @@ -88,7 +89,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 next_rip; u8 insn_len; u8 insn_bytes[15]; - u8 reserved_6[800]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; }; @@ -111,6 +116,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 12f9653bde8d..2982387ba817 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ */ #include <linux/errno.h> #include <linux/compiler.h> +#include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> @@ -721,6 +722,8 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); + kasan_check_write(to, n); + /* * While we would like to have the compiler do the checking for us * even in the non-constant size case, any false positives there are @@ -754,6 +757,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); + kasan_check_read(from, n); + might_fault(); /* See the comment in copy_from_user() above. */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3fe0eac59462..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,46 +33,10 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. - * - * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault - * we return the initial request size (1, 2 or 4), as copy_*_user should do. - * If a store crosses a page boundary and gets a fault, the x86 will not write - * anything, so this is accurate. 
*/ - static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __put_user_size(*(u8 *)from, (u8 __user *)to, - 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_size(*(u16 *)from, (u16 __user *)to, - 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_size(*(u32 *)from, (u32 __user *)to, - 4, ret, 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_size(*(u64 *)from, (u64 __user *)to, - 8, ret, 8); - __uaccess_end(); - return ret; - } - } return __copy_to_user_ll(to, from, n); } @@ -101,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. - */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 307698688fa1..2eac2aa3e37f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -7,6 +7,7 @@ #include <linux/compiler.h> #include <linux/errno.h> #include <linux/lockdep.h> +#include <linux/kasan-checks.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> @@ -109,6 +110,7 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } @@ -175,6 +177,7 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { might_fault(); + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -242,12 +245,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) { + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -258,6 +263,7 @@ static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 1); } @@ -265,6 +271,7 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 0); } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index cd54147cb365..739c0c594022 100644 
--- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -216,9 +216,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 8a4add8e4639..b9e9bb2c6089 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -73,6 +73,8 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_NPF 0x400 +#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 +#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 #define SVM_EXIT_ERR -1 @@ -107,8 +109,10 @@ { SVM_EXIT_SMI, "smi" }, \ { SVM_EXIT_INIT, "init" }, \ { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \ { SVM_EXIT_CPUID, "cpuid" }, \ { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_PAUSE, "pause" }, \ { SVM_EXIT_HLT, "hlt" }, \ { SVM_EXIT_INVLPG, "invlpg" }, \ { SVM_EXIT_INVLPGA, "invlpga" }, \ @@ -127,7 +131,10 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } + { SVM_EXIT_NPF, "npf" }, \ + { SVM_EXIT_RSM, "rsm" }, \ + { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ + { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" } #endif /* _UAPI__SVM_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9abf8551c7e4..0503f5bfb18d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,7 +83,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f115a58f7c84..9414f84584e4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -445,7 +445,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); - acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/delay.h> -#include <linux/seq_buf.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cbb3cf09b065..65cbbcd48fe4 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -422,7 +422,7 @@ static void show_saved_mc(void) data_size = get_datasize(mc_saved_header); date = mc_saved_header->date; - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", i, sig, pf, rev, total_size, 
date & 0xffff, date >> 24, diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2af478e3fd4e..f2356bda2b05 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,8 +19,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verify_pefile.h> -#include <keys/system_keyring.h> +#include <linux/verification.h> #include <asm/bootparam.h> #include <asm/setup.h> @@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data) #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { - bool trusted; - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, - VERIFYING_KEXEC_PE_SIGNATURE, - &trusted); - if (ret < 0) - return ret; - if (!trusted) - return -EKEYREJECTED; - return 0; + return verify_pefile_signature(kernel, kernel_len, + NULL, + VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 92fc1a51f994..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * livepatch.c - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> - * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/elf.h> -#include <asm/livepatch.h> - -/** - * klp_write_module_reloc() - write a relocation in a module - * @mod: module in which the section to be modified is found - * @type: ELF relocation type (see asm/elf.h) - * @loc: address that the relocation should be written to - * @value: relocation value (sym address + addend) - * - * This function writes a relocation to the specified location for - * a particular module. 
- */ -int klp_write_module_reloc(struct module *mod, unsigned long type, - unsigned long loc, unsigned long value) -{ - size_t size = 4; - unsigned long val; - unsigned long core = (unsigned long)mod->core_layout.base; - unsigned long core_size = mod->core_layout.size; - - switch (type) { - case R_X86_64_NONE: - return 0; - case R_X86_64_64: - val = value; - size = 8; - break; - case R_X86_64_32: - val = (u32)value; - break; - case R_X86_64_32S: - val = (s32)value; - break; - case R_X86_64_PC32: - val = (u32)(value - loc); - break; - default: - /* unsupported relocation type */ - return -EINVAL; - } - - if (loc < core || loc >= core + core_size) - /* loc does not point to any symbol inside the module */ - return -EINVAL; - - return probe_kernel_write((void *)loc, &val, size); -} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ba7fbba9831b..5a294e48b185 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -538,3 +538,48 @@ overflow: return -ENOEXEC; } #endif /* CONFIG_KEXEC_FILE */ + +static int +kexec_mark_range(unsigned long start, unsigned long end, bool protect) +{ + struct page *page; + unsigned int nr_pages; + + /* + * For physical range: [start, end]. We must skip the unassigned + * crashk resource with zero-valued "end" member. + */ + if (!end || start > end) + return 0; + + page = pfn_to_page(start >> PAGE_SHIFT); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + if (protect) + return set_pages_ro(page, nr_pages); + else + return set_pages_rw(page, nr_pages); +} + +static void kexec_mark_crashkres(bool protect) +{ + unsigned long control; + + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); + + /* Don't touch the control code page used in crash_kexec().*/ + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); + /* Control code page is located in the 2nd page. */ + kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + control += KEXEC_CONTROL_PAGE_SIZE; + kexec_mark_range(control, crashk_res.end, protect); +} + +void arch_kexec_protect_crashkres(void) +{ + kexec_mark_crashkres(true); +} + +void arch_kexec_unprotect_crashkres(void) +{ + kexec_mark_crashkres(false); +} diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index ed48a9f465f8..61924222a9e1 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -182,7 +182,8 @@ GLOBAL(ftrace_graph_call) jmp ftrace_stub #endif -GLOBAL(ftrace_stub) +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) retq END(ftrace_caller) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. 
*/ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2367ae07eb76..c4e7b3991b60 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -398,6 +398,11 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } + +static void __init early_initrd_acpi_init(void) +{ + early_acpi_table_init((void *)initrd_start, initrd_end - initrd_start); +} #else static void __init early_reserve_initrd(void) { @@ -405,6 +410,9 @@ static void __init early_reserve_initrd(void) static void __init reserve_initrd(void) { } +static void __init early_initrd_acpi_init(void) +{ +} #endif /* CONFIG_BLK_DEV_INITRD */ static void __init parse_setup_data(void) @@ -1138,9 +1146,7 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); -#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) - acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); -#endif + early_initrd_acpi_init(); vsmp_init(); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 9db47090ead0..5f42d038fcb4 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -443,7 +443,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index a22a488b4622..3069281904d3 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -254,7 +254,7 @@ int kvm_iommu_map_guest(struct kvm *kvm) !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { printk(KERN_WARNING "%s: No interrupt remapping support," " disallowing device assignment." - " Re-enble with \"allow_unsafe_assigned_interrupts=1\"" + " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" " module option.\n", __func__); iommu_domain_free(kvm->arch.iommu_domain); kvm->arch.iommu_domain = NULL; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 54ead79e444b..dfb4c6476877 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -382,9 +382,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u32 i, nr_ioapic_pins; int idx; - /* kvm->irq_routing must be read after clearing - * KVM_SCAN_IOAPIC. */ - smp_mb(); idx = srcu_read_lock(&kvm->irq_srcu); table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); nr_ioapic_pins = min_t(u32, table->nr_rt_entries, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1a2da0e5a373..bbb5b283ff63 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -59,9 +59,8 @@ /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ #define apic_debug(fmt, arg...) 
-#define APIC_LVT_NUM 6 /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define APIC_SHORT_MASK 0xc0000 @@ -73,14 +72,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -#define VEC_POS(v) ((v) & (32 - 1)) -#define REG_POS(v) (((v) >> 5) << 4) - -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) -{ - *((u32 *) (apic->regs + reg_off)) = val; -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -94,11 +85,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } -static inline void apic_set_vector(int vec, void *bitmap) -{ - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline void apic_clear_vector(int vec, void *bitmap) { clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -173,7 +159,7 @@ static void recalculate_apic_map(struct kvm *kvm) continue; aid = kvm_apic_id(apic); - ldr = kvm_apic_get_reg(apic, APIC_LDR); + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (aid < ARRAY_SIZE(new->phys_map)) new->phys_map[aid] = apic; @@ -182,7 +168,7 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_X2APIC; } else if (ldr) { ldr = GET_APIC_LOGICAL_ID(ldr); - if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) + if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) new->mode |= KVM_APIC_MODE_XAPIC_FLAT; else new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; @@ -212,7 +198,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { bool enabled = val & APIC_SPIV_APIC_ENABLED; - apic_set_reg(apic, APIC_SPIV, val); + kvm_lapic_set_reg(apic, APIC_SPIV, val); if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; @@ -226,13 +212,13 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) { - apic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { - apic_set_reg(apic, APIC_LDR, id); + kvm_lapic_set_reg(apic, APIC_LDR, id); recalculate_apic_map(apic->vcpu->kvm); } @@ -240,19 +226,19 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_ID, id << 24); - apic_set_reg(apic, APIC_LDR, ldr); + kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { - return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); + return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) { - return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; + return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) @@ -287,10 +273,10 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) v |= APIC_LVR_DIRECTED_EOI; - apic_set_reg(apic, APIC_LVR, v); + kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const 
unsigned int apic_lvt_mask[APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ @@ -349,16 +335,6 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline void apic_set_irr(int vec, struct kvm_lapic *apic) -{ - apic_set_vector(vec, apic->regs + APIC_IRR); - /* - * irr_pending must be true if any interrupt is pending; set it after - * APIC_IRR to avoid race with apic_clear_irr - */ - apic->irr_pending = true; -} - static inline int apic_search_irr(struct kvm_lapic *apic) { return find_highest_vector(apic->regs + APIC_IRR); @@ -416,7 +392,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); + kvm_x86_ops->hwapic_isr_update(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -464,7 +440,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; @@ -549,8 +525,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) u32 tpr, isrv, ppr, old_ppr; int isr; - old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); - tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); + old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI); + tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -563,7 +539,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic, ppr, isr, isrv); if (old_ppr != ppr) { - apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); if (ppr < old_ppr) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } @@ -571,7 +547,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { - apic_set_reg(apic, APIC_TASKPRI, tpr); + kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); apic_update_ppr(apic); } @@ -601,7 +577,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - logical_id = kvm_apic_get_reg(apic, APIC_LDR); + logical_id = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) return ((logical_id >> 16) == (mda >> 16)) @@ -610,7 +586,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) logical_id = GET_APIC_LOGICAL_ID(logical_id); mda = GET_APIC_DEST_FIELD(mda); - switch (kvm_apic_get_reg(apic, APIC_DFR)) { + switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: @@ -618,7 +594,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); + apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR)); return false; } } @@ -668,6 +644,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } +EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) @@ -921,7 +898,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if 
(apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) - apic_set_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_set_vector(vector, apic->regs + APIC_TMR); else apic_clear_vector(vector, apic->regs + APIC_TMR); } @@ -929,7 +906,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (vcpu->arch.apicv_active) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { - apic_set_irr(vector, apic); + kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -1073,8 +1050,8 @@ EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); static void apic_send_ipi(struct kvm_lapic *apic) { - u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); - u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); + u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR); + u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -1111,7 +1088,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (kvm_apic_get_reg(apic, APIC_TMICT) == 0 || + if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 || apic->lapic_timer.period == 0) return 0; @@ -1168,13 +1145,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_PROCPRI: apic_update_ppr(apic); - val = kvm_apic_get_reg(apic, offset); + val = kvm_lapic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - val = kvm_apic_get_reg(apic, offset); + val = kvm_lapic_get_reg(apic, offset); break; } @@ -1186,7 +1163,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } -static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, +int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; @@ -1223,6 +1200,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, } return 0; } +EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1240,7 +1218,7 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; - apic_reg_read(apic, offset, len, data); + kvm_lapic_reg_read(apic, offset, len, data); return 0; } @@ -1249,7 +1227,7 @@ static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; - tdcr = kvm_apic_get_reg(apic, APIC_TDCR); + tdcr = kvm_lapic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); @@ -1260,7 +1238,7 @@ static void update_divide_count(struct kvm_lapic *apic) static void apic_update_lvtt(struct kvm_lapic *apic) { - u32 timer_mode = kvm_apic_get_reg(apic, APIC_LVTT) & + u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { @@ -1296,7 +1274,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; @@ -1344,7 +1322,7 @@ static void start_apic_timer(struct kvm_lapic *apic) if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { /* 
lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; if (!apic->lapic_timer.period) @@ -1376,7 +1354,7 @@ static void start_apic_timer(struct kvm_lapic *apic) "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_apic_get_reg(apic, APIC_TMICT), + kvm_lapic_get_reg(apic, APIC_TMICT), apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); @@ -1425,7 +1403,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -1457,7 +1435,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); recalculate_apic_map(apic->vcpu->kvm); } else ret = 1; @@ -1465,17 +1443,17 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SPIV: { u32 mask = 0x3ff; - if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; - for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = kvm_apic_get_reg(apic, + for (i = 0; i < KVM_APIC_LVT_NUM; i++) { + lvt_val = kvm_lapic_get_reg(apic, APIC_LVTT + 0x10 * i); - apic_set_reg(apic, APIC_LVTT + 0x10 * i, + kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } apic_update_lvtt(apic); @@ -1486,14 +1464,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) } case APIC_ICR: /* No delay here, so we always clear the pending bit */ - apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); apic_send_ipi(apic); break; case APIC_ICR2: if (!apic_x2apic_mode(apic)) val &= 0xff000000; - apic_set_reg(apic, APIC_ICR2, val); + kvm_lapic_set_reg(apic, APIC_ICR2, val); break; case APIC_LVT0: @@ -1507,7 +1485,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; - apic_set_reg(apic, reg, val); + kvm_lapic_set_reg(apic, reg, val); break; @@ -1515,7 +1493,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); - apic_set_reg(apic, APIC_LVTT, val); + kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; @@ -1524,14 +1502,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_TMICT, val); + kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; case APIC_TDCR: if (val & 4) apic_debug("KVM_WRITE:TDCR %x\n", val); - apic_set_reg(apic, APIC_TDCR, val); + kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; @@ -1544,7 +1522,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SELF_IPI: if (apic_x2apic_mode(apic)) { - apic_reg_write(apic, APIC_ICR, 
0x40000 | (val & 0xff)); + kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); } else ret = 1; break; @@ -1556,6 +1534,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_debug("Local APIC Write to read-only register %x\n", reg); return ret; } +EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -1585,14 +1564,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, apic_debug("%s: offset 0x%x with length 0x%x, and value is " "0x%x\n", __func__, offset, len, val); - apic_reg_write(apic, offset & 0xff0, val); + kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); + kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1604,10 +1583,10 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) /* hw has done the conditional check and inst decode */ offset &= 0xff0; - apic_reg_read(vcpu->arch.apic, offset, 4, &val); + kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); /* TODO: optimize to just emulate side effect w/o one more write */ - apic_reg_write(vcpu->arch.apic, offset, val); + kvm_lapic_reg_write(vcpu->arch.apic, offset, val); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -1667,14 +1646,14 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) struct kvm_lapic *apic = vcpu->arch.apic; apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); + | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); + tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1740,28 +1719,28 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < APIC_LVT_NUM; i++) - apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < KVM_APIC_LVT_NUM; i++) + kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) - apic_set_reg(apic, APIC_LVT0, + kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); - apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); + apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - apic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); - apic_set_reg(apic, APIC_TASKPRI, 0); + kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); - apic_set_reg(apic, APIC_ESR, 0); - apic_set_reg(apic, APIC_ICR, 0); - apic_set_reg(apic, APIC_ICR2, 0); - apic_set_reg(apic, APIC_TDCR, 0); - apic_set_reg(apic, APIC_TMICT, 0); + kvm_lapic_set_reg(apic, APIC_ESR, 0); + kvm_lapic_set_reg(apic, APIC_ICR, 0); + kvm_lapic_set_reg(apic, APIC_ICR2, 0); + kvm_lapic_set_reg(apic, APIC_TDCR, 0); + kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { - apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); - apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); - apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + kvm_lapic_set_reg(apic, APIC_TMR 
+ 0x10 * i, 0); } apic->irr_pending = vcpu->arch.apicv_active; apic->isr_count = vcpu->arch.apicv_active ? 1 : 0; @@ -1806,7 +1785,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { - u32 reg = kvm_apic_get_reg(apic, lvt_type); + u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { @@ -1901,14 +1880,14 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) + ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) return -1; return highest_irr; } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) @@ -1974,7 +1953,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); apic_update_lvtt(apic); - apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); + apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; @@ -1982,9 +1961,11 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { + if (kvm_x86_ops->apicv_post_state_restore) + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -2097,7 +2078,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; + tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; @@ -2139,8 +2120,8 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* if this is ICR write vector before command */ if (reg == APIC_ICR) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -2157,10 +2138,10 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 1; } - if (apic_reg_read(apic, reg, 4, &low)) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; if (reg == APIC_ICR) - apic_reg_read(apic, APIC_ICR2, 4, &high); + kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; @@ -2176,8 +2157,8 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) /* if this is ICR write vector before command */ if (reg == APIC_ICR) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) @@ -2188,10 +2169,10 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) if (!lapic_in_kernel(vcpu)) 
return 1; - if (apic_reg_read(apic, reg, 4, &low)) + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; if (reg == APIC_ICR) - apic_reg_read(apic, APIC_ICR2, 4, &high); + kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f71183e502ee..891c6da7d4aa 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -7,6 +7,10 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 +#define KVM_APIC_LVT_NUM 6 + +#define KVM_APIC_SHORT_MASK 0xc0000 +#define KVM_APIC_DEST_MASK 0x800 struct kvm_timer { struct hrtimer timer; @@ -59,6 +63,11 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); +int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data); +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, unsigned int dest, int dest_mode); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); @@ -99,9 +108,32 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); -static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) +#define VEC_POS(v) ((v) & (32 - 1)) +#define REG_POS(v) (((v) >> 5) << 4) + +static inline void kvm_lapic_set_vector(int vec, void *bitmap) +{ + set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) +{ + kvm_lapic_set_vector(vec, apic->regs + APIC_IRR); + /* + * irr_pending must be true if any interrupt is pending; set it after + * APIC_IRR to avoid race with apic_clear_irr + */ + apic->irr_pending = true; +} + +static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - return *((u32 *) (apic->regs + reg_off)); + *((u32 *) (apic->regs + reg_off)) = val; } extern struct static_key kvm_no_apic_vcpu; @@ -169,7 +201,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) static inline int kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; + return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 38c0c32926c9..24e800116ab4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1909,18 +1909,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_indirect_valid_sp has skipped that kind of page and - * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped - * all the obsolete pages. + * for_each_gfn_valid_sp() has skipped that kind of pages. 
*/ -#define for_each_gfn_sp(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn)) {} else + if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ + || (_sp)->role.invalid) {} else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct || (_sp)->role.invalid) {} else + for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -1961,6 +1960,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2105,11 +2109,6 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -2136,10 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn) { - if (is_obsolete_sp(vcpu->kvm, sp)) - continue; - + for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 3f8c732117ec..c146f3c262c3 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -44,8 +44,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRdefType: case MSR_IA32_CR_PAT: return true; - case 0x2f8: - return true; } return false; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fafd720ce10a..2214214c786b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -14,6 +14,9 @@ * the COPYING file in the top-level directory. * */ + +#define pr_fmt(fmt) "SVM: " fmt + #include <linux/kvm_host.h> #include "irq.h" @@ -32,6 +35,7 @@ #include <linux/trace_events.h> #include <linux/slab.h> +#include <asm/apic.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -68,6 +72,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SVM_FEATURE_DECODE_ASSIST (1 << 7) #define SVM_FEATURE_PAUSE_FILTER (1 << 10) +#define SVM_AVIC_DOORBELL 0xc001011b + #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ @@ -78,6 +84,18 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL +#define AVIC_HPA_MASK ~((0xFFFULL << 52) || 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. 
+ */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -162,8 +180,21 @@ struct vcpu_svm { /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; + + u32 ldr_reg; + struct page *avic_backing_page; + u64 *avic_physical_id_cache; + bool avic_is_running; }; +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) + static DEFINE_PER_CPU(u64, current_tsc_ratio); #define TSC_RATIO_DEFAULT 0x0100000000ULL @@ -205,6 +236,10 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +/* enable / disable AVIC */ +static int avic; +module_param(avic, int, S_IRUGO); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -228,12 +263,18 @@ enum { VMCB_SEG, /* CS, DS, SS, ES, CPL */ VMCB_CR2, /* CR2 only */ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, + * AVIC PHYSICAL_TABLE pointer, + * AVIC LOGICAL_TABLE pointer + */ VMCB_DIRTY_MAX, }; /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -255,6 +296,23 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) +{ + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; + mark_dirty(svm->vmcb, VMCB_AVIC); +} + +static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 *entry = svm->avic_physical_id_cache; + + if (!entry) + return false; + + return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); +} + static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; @@ -923,6 +981,12 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC))) + avic = false; + + if (avic) + pr_info("AVIC enabled\n"); + return 0; err: @@ -1000,6 +1064,22 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void avic_init_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; + phys_addr_t bpa = page_to_phys(svm->avic_backing_page); + phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); + phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + + vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; + vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + 
svm->vcpu.arch.apicv_active = true; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1014,7 +1094,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR0_WRITE); set_cr_intercept(svm, INTERCEPT_CR3_WRITE); set_cr_intercept(svm, INTERCEPT_CR4_WRITE); - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (!kvm_vcpu_apicv_active(&svm->vcpu)) + set_cr_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1110,9 +1191,197 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } + if (avic) + avic_init_vmcb(svm); + mark_all_dirty(svm->vmcb); enable_gif(svm); + +} + +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +{ + u64 *avic_physical_id_table; + struct kvm_arch *vm_data = &vcpu->kvm->arch; + + if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + return NULL; + + avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page); + + return &avic_physical_id_table[index]; +} + +/** + * Note: + * AVIC hardware walks the nested page table to check permissions, + * but does not use the SPA address specified in the leaf page + * table entry since it uses address in the AVIC_BACKING_PAGE pointer + * field of the VMCB. Therefore, we set up the + * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. + */ +static int avic_init_access_page(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + + if (kvm->arch.apic_access_page_done) + return 0; + + ret = x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); + if (ret) + return ret; + + kvm->arch.apic_access_page_done = true; + return 0; +} + +static int avic_init_backing_page(struct kvm_vcpu *vcpu) +{ + int ret; + u64 *entry, new_entry; + int id = vcpu->vcpu_id; + struct vcpu_svm *svm = to_svm(vcpu); + + ret = avic_init_access_page(vcpu); + if (ret) + return ret; + + if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + return -EINVAL; + + if (!svm->vcpu.arch.apic->regs) + return -EINVAL; + + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + + /* Setting AVIC backing page address in the phy APIC ID table */ + entry = avic_get_physical_id_entry(vcpu, id); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry = (page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + svm->avic_physical_id_cache = entry; + + return 0; +} + +static void avic_vm_destroy(struct kvm *kvm) +{ + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_logical_id_table_page) + __free_page(vm_data->avic_logical_id_table_page); + if (vm_data->avic_physical_id_table_page) + __free_page(vm_data->avic_physical_id_table_page); +} + +static int avic_vm_init(struct kvm *kvm) +{ + int err = -ENOMEM; + struct kvm_arch *vm_data = &kvm->arch; + struct page *p_page; + struct page *l_page; + + if (!avic) + return 0; + + /* Allocating physical APIC ID table (4KB) */ + p_page = alloc_page(GFP_KERNEL); + if (!p_page) + goto free_avic; + + vm_data->avic_physical_id_table_page = p_page; + clear_page(page_address(p_page)); + + /* Allocating logical APIC ID table (4KB) */ + l_page = alloc_page(GFP_KERNEL); + if (!l_page) + goto free_avic; + + vm_data->avic_logical_id_table_page = l_page; + clear_page(page_address(l_page)); + + return 0; + +free_avic: + avic_vm_destroy(kvm); + return err; +} + +/** + * This function is called during VCPU halt/unhalt. 
+ */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + u64 entry; + int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + svm->avic_is_running = is_run; + + /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + if (is_run) + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); +} + +static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + u64 entry; + /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + int h_physical_id = __default_cpu_present_to_apicid(cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + if (svm->avic_is_running) + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); +} + +static void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + u64 entry; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1131,6 +1400,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + + if (kvm_vcpu_apicv_active(vcpu) && !init_event) + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -1169,6 +1441,17 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; + if (avic) { + err = avic_init_backing_page(&svm->vcpu); + if (err) + goto free_page4; + } + + /* We initialize this flag to true to make sure that the is_running + * bit would be set the first time the vcpu is loaded. 
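For orientation, the AVIC_PHYSICAL_ID_ENTRY_* masks and the avic_vcpu_load()/avic_set_running() updates above describe one 64-bit slot of the physical APIC ID table: host physical APIC ID in the low byte, the vCPU's APIC backing page address in bits 51:12, an is-running bit and a valid bit. A hypothetical user-space sketch composing such an entry (make_entry() is an illustration, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

#define ENTRY_HOST_PHYSICAL_ID_MASK	(0xFFULL)
#define ENTRY_BACKING_PAGE_MASK		(0xFFFFFFFFFFULL << 12)
#define ENTRY_IS_RUNNING_MASK		(1ULL << 62)
#define ENTRY_VALID_MASK		(1ULL << 63)

static uint64_t make_entry(uint64_t backing_page_pa, uint8_t host_apic_id, int running)
{
	uint64_t e = 0;

	e |= backing_page_pa & ENTRY_BACKING_PAGE_MASK;		/* 4K-aligned APIC backing page */
	e |= host_apic_id & ENTRY_HOST_PHYSICAL_ID_MASK;	/* physical CPU the vCPU runs on */
	if (running)
		e |= ENTRY_IS_RUNNING_MASK;			/* target can take a doorbell */
	e |= ENTRY_VALID_MASK;
	return e;
}

int main(void)
{
	printf("entry = %#018llx\n",
	       (unsigned long long)make_entry(0x123456000ULL, 3, 1));
	return 0;
}

The is-running bit is the piece that avic_vcpu_load()/avic_vcpu_put()/avic_set_running() toggle, so the hardware knows whether a doorbell write can reach the target vCPU.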
+ */ + svm->avic_is_running = true; + svm->nested.hsave = page_address(hsave_page); svm->msrpm = page_address(msrpm_pages); @@ -1187,6 +1470,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1243,6 +1528,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* This assumes that the kernel never uses MSR_TSC_AUX */ if (static_cpu_has(X86_FEATURE_RDTSCP)) wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1250,6 +1537,8 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int i; + avic_vcpu_put(vcpu); + ++vcpu->stat.host_state_reload; kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 @@ -1265,6 +1554,16 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } +static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + avic_set_running(vcpu, false); +} + +static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + avic_set_running(vcpu, true); +} + static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -2673,10 +2972,11 @@ static int clgi_interception(struct vcpu_svm *svm) disable_gif(svm); /* After a CLGI no interrupts should come */ - svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - - mark_dirty(svm->vmcb, VMCB_INTR); + if (!kvm_vcpu_apicv_active(&svm->vcpu)) { + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); + } return 1; } @@ -3212,6 +3512,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_IA32_APICBASE: + if (kvm_vcpu_apicv_active(vcpu)) + avic_update_vapic_bar(to_svm(vcpu), data); + /* Follow through */ default: return kvm_set_msr_common(vcpu, msr); } @@ -3281,6 +3585,278 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + +static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +{ + u32 icrh = svm->vmcb->control.exit_info_1 >> 32; + u32 icrl = svm->vmcb->control.exit_info_1; + u32 id = svm->vmcb->control.exit_info_2 >> 32; + u32 index = svm->vmcb->control.exit_info_2 && 0xFF; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + + switch (id) { + case AVIC_IPI_FAILURE_INVALID_INT_TYPE: + /* + * AVIC hardware handles the generation of + * IPIs when the specified Message Type is Fixed + * (also known as fixed delivery mode) and + * the Trigger Mode is edge-triggered. The hardware + * also supports self and broadcast delivery modes + * specified via the Destination Shorthand(DSH) + * field of the ICRL. Logical and physical APIC ID + * formats are supported. All other IPI types cause + * a #VMEXIT, which needs to emulated. 
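The "incomplete IPI" exit decoded at the top of avic_incomplete_ipi_interception() above packs the guest's ICR into EXITINFO1 and the failure reason plus an index into EXITINFO2. A small decoding sketch following that layout (the example values are made up):

#include <stdint.h>
#include <stdio.h>

struct incomplete_ipi {
	uint32_t icrh, icrl, id, index;
};

static struct incomplete_ipi decode_ipi_exit(uint64_t exit_info_1, uint64_t exit_info_2)
{
	struct incomplete_ipi r;

	r.icrh  = exit_info_1 >> 32;		/* ICR high: destination */
	r.icrl  = (uint32_t)exit_info_1;	/* ICR low: vector, mode, shorthand */
	r.id    = exit_info_2 >> 32;		/* failure reason (AVIC_IPI_FAILURE_*) */
	r.index = exit_info_2 & 0xFF;		/* low byte: index, as logged by the tracepoint */
	return r;
}

int main(void)
{
	struct incomplete_ipi r =
		decode_ipi_exit(0x000000c8000000f0ULL, 0x0000000100000002ULL);

	printf("icrh=%#x icrl=%#x id=%u index=%u\n",
	       (unsigned int)r.icrh, (unsigned int)r.icrl,
	       (unsigned int)r.id, (unsigned int)r.index);
	return 0;
}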
+ */ + kvm_lapic_reg_write(apic, APIC_ICR2, icrh); + kvm_lapic_reg_write(apic, APIC_ICR, icrl); + break; + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + /* + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } + break; + } + case AVIC_IPI_FAILURE_INVALID_TARGET: + break; + case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: + WARN_ONCE(1, "Invalid backing page\n"); + break; + default: + pr_err("Unknown IPI interception\n"); + } + + return 1; +} + +static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) +{ + struct kvm_arch *vm_data = &vcpu->kvm->arch; + int index; + u32 *logical_apic_id_table; + int dlid = GET_APIC_LOGICAL_ID(ldr); + + if (!dlid) + return NULL; + + if (flat) { /* flat */ + index = ffs(dlid) - 1; + if (index > 7) + return NULL; + } else { /* cluster */ + int cluster = (dlid & 0xf0) >> 4; + int apic = ffs(dlid & 0x0f) - 1; + + if ((apic < 0) || (apic > 7) || + (cluster >= 0xf)) + return NULL; + index = (cluster << 2) + apic; + } + + logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page); + + return &logical_apic_id_table[index]; +} + +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, + bool valid) +{ + bool flat; + u32 *entry, new_entry; + + flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; + entry = avic_get_logical_id_entry(vcpu, ldr, flat); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); + if (valid) + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + else + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + return 0; +} + +static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +{ + int ret; + struct vcpu_svm *svm = to_svm(vcpu); + u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + + if (!ldr) + return 1; + + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); + if (ret && svm->ldr_reg) { + avic_ldr_write(vcpu, 0, svm->ldr_reg, false); + svm->ldr_reg = 0; + } else { + svm->ldr_reg = ldr; + } + return ret; +} + +static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) +{ + u64 *old, *new; + struct vcpu_svm *svm = to_svm(vcpu); + u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); + u32 id = (apic_id_reg >> 24) & 0xff; + + if (vcpu->vcpu_id == id) + return 0; + + old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); + new = avic_get_physical_id_entry(vcpu, id); + if (!new || !old) + return 1; + + /* We need to move physical_id_entry to new offset */ + *new = *old; + *old = 0ULL; + to_svm(vcpu)->avic_physical_id_cache = new; + + /* + * Also update the guest physical APIC ID in the logical + * APIC ID table entry if already setup the LDR. 
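avic_get_logical_id_entry() above maps an LDR value to a slot in the 4KB logical APIC ID table, handling flat and cluster destination formats differently. A stand-alone sketch of that index calculation (logical_id_index() is a hypothetical helper for illustration; it returns -1 for the cases the kernel code rejects):

#include <stdio.h>
#include <strings.h>	/* ffs() */

static int logical_id_index(unsigned int ldr, int flat)
{
	unsigned int dlid = (ldr >> 24) & 0xff;	/* logical destination byte */

	if (!dlid)
		return -1;

	if (flat) {
		int index = ffs(dlid) - 1;	/* position of the set bit */

		return index > 7 ? -1 : index;
	} else {
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;	/* four entries per cluster */
	}
}

int main(void)
{
	printf("flat LDR 0x08000000    -> index %d\n", logical_id_index(0x08000000, 1));
	printf("cluster LDR 0x21000000 -> index %d\n", logical_id_index(0x21000000, 0));
	return 0;
}

Whether flat or cluster indexing applies is taken from APIC_DFR, which is why avic_handle_dfr_update() clears the whole table when the guest switches delivery format.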
+ */ + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + + return 0; +} + +static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_arch *vm_data = &vcpu->kvm->arch; + u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); + u32 mod = (dfr >> 28) & 0xf; + + /* + * We assume that all local APICs are using the same type. + * If this changes, we need to flush the AVIC logical + * APID id table. + */ + if (vm_data->ldr_mode == mod) + return 0; + + clear_page(page_address(vm_data->avic_logical_id_table_page)); + vm_data->ldr_mode = mod; + + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + return 0; +} + +static int avic_unaccel_trap_write(struct vcpu_svm *svm) +{ + struct kvm_lapic *apic = svm->vcpu.arch.apic; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + + switch (offset) { + case APIC_ID: + if (avic_handle_apic_id_update(&svm->vcpu)) + return 0; + break; + case APIC_LDR: + if (avic_handle_ldr_update(&svm->vcpu)) + return 0; + break; + case APIC_DFR: + avic_handle_dfr_update(&svm->vcpu); + break; + default: + break; + } + + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); + + return 1; +} + +static bool is_avic_unaccelerated_access_trap(u32 offset) +{ + bool ret = false; + + switch (offset) { + case APIC_ID: + case APIC_EOI: + case APIC_RRR: + case APIC_LDR: + case APIC_DFR: + case APIC_SPIV: + case APIC_ESR: + case APIC_ICR: + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_TMICT: + case APIC_TDCR: + ret = true; + break; + default: + break; + } + return ret; +} + +static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +{ + int ret = 0; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + u32 vector = svm->vmcb->control.exit_info_2 & + AVIC_UNACCEL_ACCESS_VECTOR_MASK; + bool write = (svm->vmcb->control.exit_info_1 >> 32) & + AVIC_UNACCEL_ACCESS_WRITE_MASK; + bool trap = is_avic_unaccelerated_access_trap(offset); + + trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trap, write, vector); + if (trap) { + /* Handling Trap */ + WARN_ONCE(!write, "svm: Handling trap read.\n"); + ret = avic_unaccel_trap_write(svm); + } else { + /* Handling Fault */ + ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + } + + return ret; +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3344,6 +3920,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, [SVM_EXIT_RSM] = emulate_on_interception, + [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, + [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3375,10 +3953,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); + 
pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); + pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); + pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3562,6 +4144,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; + /* The following fields are ignored when AVIC is enabled */ control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -3583,11 +4166,17 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +{ + return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); +} + static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu) || + kvm_vcpu_apicv_active(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -3606,11 +4195,28 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(void) { - return false; + return avic; +} + +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ } +static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ +} + +/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + if (!avic) + return; + + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_INTR); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -3623,6 +4229,18 @@ static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) return; } +static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) +{ + kvm_lapic_set_irr(vec, vcpu->arch.apic); + smp_mb__after_atomic(); + + if (avic_vcpu_is_running(vcpu)) + wrmsrl(SVM_AVIC_DOORBELL, + __default_cpu_present_to_apicid(vcpu->cpu)); + else + kvm_vcpu_wake_up(vcpu); +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3677,6 +4295,9 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + return; + /* * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes * 1, because that's a separate STGI/VMRUN intercept. 
The next time we @@ -3728,7 +4349,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu)) return; if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { @@ -3742,7 +4363,8 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) + if (svm_nested_virtualize_tpr(vcpu) || + kvm_vcpu_apicv_active(vcpu)) return; cr8 = kvm_get_cr8(vcpu); @@ -4045,14 +4667,26 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_cpuid_entry2 *entry; /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + entry = kvm_find_cpuid_entry(vcpu, 1, 0); + if (entry) + entry->ecx &= ~bit(X86_FEATURE_X2APIC); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x1: + if (avic) + entry->ecx &= ~bit(X86_FEATURE_X2APIC); + break; case 0x80000001: if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ @@ -4307,6 +4941,15 @@ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { } +static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) +{ + if (avic_handle_apic_id_update(vcpu) != 0) + return; + if (avic_handle_dfr_update(vcpu) != 0) + return; + avic_handle_ldr_update(vcpu); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4322,9 +4965,14 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, + .vm_init = avic_vm_init, + .vm_destroy = avic_vm_destroy, + .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, + .vcpu_blocking = svm_vcpu_blocking, + .vcpu_unblocking = svm_vcpu_unblocking, .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, @@ -4382,6 +5030,9 @@ static struct kvm_x86_ops svm_x86_ops = { .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, + .hwapic_irr_update = svm_hwapic_irr_update, + .hwapic_isr_update = svm_hwapic_isr_update, + .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, @@ -4415,6 +5066,7 @@ static struct kvm_x86_ops svm_x86_ops = { .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b72743c5668d..8de925031b5c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1291,6 +1291,63 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +/* + * Tracepoint for AMD AVIC + */ +TRACE_EVENT(kvm_avic_incomplete_ipi, + TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), + TP_ARGS(vcpu, icrh, icrl, id, index), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, icrh) + __field(u32, icrl) + __field(u32, id) + __field(u32, index) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->icrh = icrh; + __entry->icrl = icrl; + __entry->id = id; + __entry->index = index; + ), + + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", + __entry->vcpu, 
__entry->icrh, __entry->icrl, + __entry->id, __entry->index) +); + +TRACE_EVENT(kvm_avic_unaccelerated_access, + TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), + TP_ARGS(vcpu, offset, ft, rw, vec), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, offset) + __field(bool, ft) + __field(bool, rw) + __field(u32, vec) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->offset = offset; + __entry->ft = ft; + __entry->rw = rw; + __entry->vec = vec; + ), + + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", + __entry->vcpu, + __entry->offset, + __print_symbolic(__entry->offset, kvm_trace_symbol_apic), + __entry->ft ? "trap" : "fault", + __entry->rw ? "write" : "read", + __entry->vec) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb47fe3da292..e605d1ed334f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5050,8 +5050,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); @@ -8318,19 +8318,19 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) vmcs_write64(APIC_ACCESS_ADDR, hpa); } -static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; - if (isr == -1) - isr = 0; + if (max_isr == -1) + max_isr = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; - if (isr != old) { + if (max_isr != old) { status &= 0xff; - status |= isr << 8; + status |= max_isr << 8; vmcs_write16(GUEST_INTR_STATUS, status); } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12f33e662382..c805cf494154 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -161,6 +161,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -2002,22 +2003,8 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.pv_time_enabled = false; } -static void accumulate_steal_time(struct kvm_vcpu *vcpu) -{ - u64 delta; - - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - vcpu->arch.st.accum_steal = delta; -} - static void record_steal_time(struct kvm_vcpu *vcpu) { - accumulate_steal_time(vcpu); - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2025,9 +2012,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; - vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; - vcpu->arch.st.steal.version += 2; - vcpu->arch.st.accum_steal = 0; + if (vcpu->arch.st.steal.version & 1) + vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + + vcpu->arch.st.steal.version += 1; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + 
vcpu->arch.st.steal.steal += current->sched_info.run_delay - + vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + vcpu->arch.st.steal.version += 1; kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); @@ -7752,6 +7756,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + if (kvm_x86_ops->vm_init) + return kvm_x86_ops->vm_init(kvm); + return 0; } @@ -7873,6 +7880,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } + if (kvm_x86_ops->vm_destroy) + kvm_x86_ops->vm_destroy(kvm); kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -8355,19 +8364,21 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +bool kvm_arch_has_irq_bypass(void) +{ + return kvm_x86_ops->update_pi_irte != NULL; +} + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (kvm_x86_ops->update_pi_irte) { - irqfd->producer = prod; - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - } + irqfd->producer = prod; - return -EINVAL; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -8377,11 +8388,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (!kvm_x86_ops->update_pi_irte) { - WARN_ON(irqfd->producer != NULL); - return; - } - WARN_ON(irqfd->producer != prod); irqfd->producer = NULL; @@ -8429,3 +8435,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 14a95054d4e0..2ae8584b44c7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt) } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { + hugetlb_bad_size(); printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", ps >> 20); return 0; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..9c086c57105c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4286f3618bd0..fe04a04dab8e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -110,11 +110,16 @@ static void bpf_flush_icache(void *start, void *end) ((int)K < 0 ? 
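The record_steal_time() rewrite above publishes steal time with an even/odd version: the version becomes odd before the payload is touched and even again afterwards, with the record written out around each step. A guest-side read loop consistent with that protocol could look like the sketch below (kvm_steal_time_like is a simplified stand-in for the real structure, and plain compiler barriers stand in for the real ones):

#include <stdint.h>
#include <stdio.h>

struct kvm_steal_time_like {
	uint32_t version;
	uint64_t steal;
};

static uint64_t read_steal(volatile struct kvm_steal_time_like *st)
{
	uint32_t v1, v2;
	uint64_t steal;

	do {
		v1 = st->version;
		__asm__ __volatile__("" ::: "memory");
		steal = st->steal;			/* payload */
		__asm__ __volatile__("" ::: "memory");
		v2 = st->version;
	} while ((v1 & 1) || v1 != v2);			/* odd or changed: writer active, retry */

	return steal;
}

int main(void)
{
	struct kvm_steal_time_like st = { .version = 2, .steal = 123456789 };

	printf("steal = %llu ns\n", (unsigned long long)read_steal(&st));
	return 0;
}

The extra bump at the top of the hunk ("first time write, random junk") just forces the field to a known even value before the update protocol starts.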
((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) /* pick a register outside of BPF range for JIT internal work */ -#define AUX_REG (MAX_BPF_REG + 1) +#define AUX_REG (MAX_BPF_JIT_REG + 1) -/* the following table maps BPF registers to x64 registers. - * x64 register r12 is unused, since if used as base address register - * in load/store instructions, it always needs an extra byte of encoding +/* The following table maps BPF registers to x64 registers. + * + * x64 register r12 is unused, since if used as base address + * register in load/store instructions, it always needs an + * extra byte of encoding and is callee saved. + * + * r9 caches skb->len - skb->data_len + * r10 caches skb->data, and used for blinding (if enabled) */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* rax */ @@ -128,6 +133,7 @@ static const int reg2hex[] = { [BPF_REG_8] = 6, /* r14 callee saved */ [BPF_REG_9] = 7, /* r15 callee saved */ [BPF_REG_FP] = 5, /* rbp readonly */ + [BPF_REG_AX] = 2, /* r10 temp register */ [AUX_REG] = 3, /* r11 temp register */ }; @@ -141,7 +147,8 @@ static bool is_ereg(u32 reg) BIT(AUX_REG) | BIT(BPF_REG_7) | BIT(BPF_REG_8) | - BIT(BPF_REG_9)); + BIT(BPF_REG_9) | + BIT(BPF_REG_AX)); } /* add modifiers if 'reg' maps to x64 registers r8..r15 */ @@ -182,6 +189,7 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; + bool seen_ax_reg; }; /* maximum number of bytes emitted while JITing one eBPF insn */ @@ -345,6 +353,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0); bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0; @@ -367,6 +376,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int ilen; u8 *func; + if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) + ctx->seen_ax_reg = seen_ax_reg = true; + switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -1002,6 +1014,10 @@ common_load: * sk_load_* helpers also use %r10 and %r9d. * See bpf_jit.S */ + if (seen_ax_reg) + /* r10 = skb->data, mov %r10, off32(%rbx) */ + EMIT3_off32(0x4c, 0x8b, 0x93, + offsetof(struct sk_buff, data)); EMIT1_off32(0xE8, jmp_offset); /* call */ break; @@ -1073,25 +1089,37 @@ void bpf_jit_compile(struct bpf_prog *prog) { } -void bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; + struct bpf_prog *tmp, *orig_prog = prog; int proglen, oldproglen = 0; struct jit_context ctx = {}; + bool tmp_blinded = false; u8 *image = NULL; int *addrs; int pass; int i; if (!bpf_jit_enable) - return; + return orig_prog; - if (!prog || !prog->len) - return; + tmp = bpf_jit_blind_constants(prog); + /* If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. 
+ */ + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); - if (!addrs) - return; + if (!addrs) { + prog = orig_prog; + goto out; + } /* Before first pass, make a rough estimation of addrs[] * each bpf instruction is translated to less than 64 bytes @@ -1113,21 +1141,25 @@ void bpf_int_jit_compile(struct bpf_prog *prog) image = NULL; if (header) bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_addrs; } if (image) { if (proglen != oldproglen) { pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", proglen, oldproglen); - goto out; + prog = orig_prog; + goto out_addrs; } break; } if (proglen == oldproglen) { header = bpf_jit_binary_alloc(proglen, &image, 1, jit_fill_hole); - if (!header) - goto out; + if (!header) { + prog = orig_prog; + goto out_addrs; + } } oldproglen = proglen; } @@ -1141,8 +1173,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog) prog->bpf_func = (void *)image; prog->jited = 1; } -out: + +out_addrs: kfree(addrs); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); + return prog; } void bpf_jit_free(struct bpf_prog *fp) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3cd69832d7f4..b2a4e2a61f6b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -396,7 +396,6 @@ int __init pci_acpi_init(void) return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); - acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; x86_init.pci.init_irq = x86_init_noop; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 381a43c40bf7..8196054fedb0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -516,7 +516,7 @@ void __init pcibios_set_cache_line_size(void) int __init pcibios_init(void) { - if (!raw_pci_ops) { + if (!raw_pci_ops && !raw_pci_ext_ops) { printk(KERN_WARNING "PCI: System does not support PCI\n"); return 0; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b7de1929714b..837ea36a837d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -552,9 +552,16 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); +/* + * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * + * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html + * entry BDF2. + */ static void pci_bdwep_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 4bd08b0fc8ea..99ddab79215e 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -491,8 +491,11 @@ int __init pci_xen_initial_domain(void) #endif __acpi_register_gsi = acpi_register_gsi_xen; __acpi_unregister_gsi = NULL; - /* Pre-allocate legacy irqs */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) { + /* + * Pre-allocate the legacy IRQs. Use NR_LEGACY_IRQS here + * because we don't have a PIC and thus nr_legacy_irqs() is zero. 
+ */ + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { int trigger, polarity; if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 097cb09d917b..4480c06cade7 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -371,5 +371,5 @@ bool efi_reboot_required(void) bool efi_poweroff_required(void) { - return !!acpi_gbl_reduced_hardware; + return acpi_gbl_reduced_hardware || acpi_no_s5; } diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 237c6831e095..6be22f991b59 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -61,7 +61,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso_enabled) return 0; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7ab29518a3b9..e345891450c3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -393,6 +393,9 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long i = 0; unsigned long n = end_pfn - start_pfn; + if (remap_pfn == 0) + remap_pfn = nr_pages; + while (i < n) { unsigned long cur_pfn = start_pfn + i; unsigned long left = n - i; @@ -438,17 +441,29 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap(unsigned long nr_pages) +static unsigned long __init xen_count_remap_pages( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pages) +{ + if (start_pfn >= nr_pages) + return remap_pages; + + return remap_pages + min(end_pfn, nr_pages) - start_pfn; +} + +static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, + unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, + unsigned long nr_pages, unsigned long last_val)) { phys_addr_t start = 0; - unsigned long last_pfn = nr_pages; + unsigned long ret_val = 0; const struct e820entry *entry = xen_e820_map; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the - * end of the map) is reached, then set the 1:1 map and - * remap the memory in those non-RAM regions. + * end of the map) is reached, then call the provided function + * to perform its duty on the non-RAM region. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 @@ -466,14 +481,13 @@ static void __init xen_set_identity_and_remap(unsigned long nr_pages) end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - last_pfn = xen_set_identity_and_remap_chunk( - start_pfn, end_pfn, nr_pages, - last_pfn); + ret_val = func(start_pfn, end_pfn, nr_pages, + ret_val); start = end; } } - pr_info("Released %ld page(s)\n", xen_released_pages); + return ret_val; } /* @@ -596,35 +610,6 @@ static void __init xen_ignore_unusable(void) } } -static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) -{ - unsigned long extra = 0; - unsigned long start_pfn, end_pfn; - const struct e820entry *entry = xen_e820_map; - int i; - - end_pfn = 0; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { - start_pfn = PFN_DOWN(entry->addr); - /* Adjacent regions on non-page boundaries handling! */ - end_pfn = min(end_pfn, start_pfn); - - if (start_pfn >= max_pfn) - return extra + max_pfn - end_pfn; - - /* Add any holes in map to result. 
*/ - extra += start_pfn - end_pfn; - - end_pfn = PFN_UP(entry->addr + entry->size); - end_pfn = min(end_pfn, max_pfn); - - if (entry->type != E820_RAM) - extra += end_pfn - start_pfn; - } - - return extra; -} - bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820entry *entry; @@ -804,7 +789,7 @@ char * __init xen_memory_setup(void) max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ - max_pages += xen_count_remap_pages(max_pfn); + max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages); if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; @@ -922,7 +907,9 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. */ - xen_set_identity_and_remap(max_pfn); + xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk); + + pr_info("Released %ld page(s)\n", xen_released_pages); return "Xen"; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a0a4e554c6f1..6deba5bc7e34 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -290,11 +290,11 @@ static int xen_vcpuop_set_next_event(unsigned long delta, WARN_ON(!clockevent_state_oneshot(evt)); single.timeout_abs_ns = get_abs_timeout(delta); - single.flags = VCPU_SSHOTTMR_future; + /* Get an event anyway, even if the timeout is already expired */ + single.flags = 0; ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); - - BUG_ON(ret != 0 && ret != -ETIME); + BUG_ON(ret != 0); return ret; } |
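The xen_foreach_remap_area() refactor above folds the old "count" and "set identity and remap" passes into one walker over the non-RAM parts of the E820 map, parameterised by a callback that threads an accumulator through the calls. A simplified sketch of that shape (the e820_entry layout and values here are made up for illustration; the real walker also merges adjacent regions and rounds to page boundaries):

#include <stdio.h>

struct e820_entry { unsigned long start_pfn, end_pfn; int ram; };

typedef unsigned long (*remap_fn)(unsigned long start_pfn, unsigned long end_pfn,
				  unsigned long nr_pages, unsigned long last_val);

/* Counterpart of xen_count_remap_pages(): how many non-RAM pfns fall below nr_pages. */
static unsigned long count_remap_pages(unsigned long start_pfn, unsigned long end_pfn,
				       unsigned long nr_pages, unsigned long acc)
{
	if (start_pfn >= nr_pages)
		return acc;
	return acc + (end_pfn < nr_pages ? end_pfn : nr_pages) - start_pfn;
}

static unsigned long foreach_non_ram(const struct e820_entry *map, int n,
				     unsigned long nr_pages, remap_fn fn)
{
	unsigned long val = 0;
	int i;

	for (i = 0; i < n; i++)
		if (!map[i].ram && map[i].start_pfn < map[i].end_pfn)
			val = fn(map[i].start_pfn, map[i].end_pfn, nr_pages, val);
	return val;
}

int main(void)
{
	const struct e820_entry map[] = {
		{ 0x000, 0x0a0, 1 },	/* RAM      */
		{ 0x0a0, 0x100, 0 },	/* reserved */
		{ 0x100, 0x800, 1 },	/* RAM      */
	};

	printf("pages to remap: %lu\n",
	       foreach_non_ram(map, 3, 0x400, count_remap_pages));
	return 0;
}

Choosing between xen_count_remap_pages and xen_set_identity_and_remap_chunk is then just a matter of which function pointer xen_memory_setup() hands to the walker, which is exactly what the hunk above does.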